From 04778805dd595e37e33f6ef0e02d9337a1154e09 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 13 Jan 2018 13:18:41 -0500 Subject: [PATCH] BUG/TST: assure conversions of datetimelikes for object, numeric dtypes (#19224) closes #19223 closes #12425 --- doc/source/whatsnew/v0.23.0.txt | 5 ++ pandas/_libs/tslibs/conversion.pyx | 26 +++++++++- pandas/core/dtypes/cast.py | 42 +++++++-------- pandas/core/dtypes/common.py | 8 ++- pandas/core/internals.py | 9 +++- pandas/tests/frame/test_dtypes.py | 65 ++++++++++++++++++++++++ pandas/tests/reshape/merge/test_merge.py | 24 ++++----- pandas/tests/series/test_constructors.py | 18 +++++-- pandas/tests/series/test_operators.py | 32 +++++------- 9 files changed, 164 insertions(+), 65 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3a3c2bf0c5ae4..a0205a8d64cb7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -385,6 +385,11 @@ Conversion - Bug in localization of a naive, datetime string in a ``Series`` constructor with a ``datetime64[ns, tz]`` dtype (:issue:`174151`) - :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`) + + +- Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) + + Indexing ^^^^^^^^ diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 11e1787cd77da..53abdd013ec37 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -29,7 +29,7 @@ from np_datetime cimport (check_dts_bounds, from util cimport (is_string_object, is_datetime64_object, - is_integer_object, is_float_object) + is_integer_object, is_float_object, is_array) from timedeltas cimport cast_from_unit from timezones cimport (is_utc, is_tzlocal, is_fixed_offset, @@ -45,6 +45,8 @@ from nattype cimport NPY_NAT, checknull_with_nat # Constants cdef int64_t DAY_NS = 86400000000000LL 
+NS_DTYPE = np.dtype('M8[ns]') +TD_DTYPE = np.dtype('m8[ns]') UTC = pytz.UTC @@ -73,13 +75,14 @@ cdef inline int64_t get_datetime64_nanos(object val) except? -1: return ival -def ensure_datetime64ns(ndarray arr): +def ensure_datetime64ns(ndarray arr, copy=True): """ Ensure a np.datetime64 array has dtype specifically 'datetime64[ns]' Parameters ---------- arr : ndarray + copy : boolean, default True Returns ------- @@ -104,6 +107,8 @@ def ensure_datetime64ns(ndarray arr): unit = get_datetime64_unit(arr.flat[0]) if unit == PANDAS_FR_ns: + if copy: + arr = arr.copy() result = arr else: for i in range(n): @@ -117,6 +122,23 @@ def ensure_datetime64ns(ndarray arr): return result +def ensure_timedelta64ns(ndarray arr, copy=True): + """ + Ensure a np.timedelta64 array has dtype specifically 'timedelta64[ns]' + + Parameters + ---------- + arr : ndarray + copy : boolean, default True + + Returns + ------- + result : ndarray with dtype timedelta64[ns] + + """ + return arr.astype(TD_DTYPE, copy=copy) + + def datetime_to_datetime64(ndarray[object] values): """ Convert ndarray of datetime-like objects to int64 array representing diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b3ae8aae53b35..672e60c9dcde5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -656,11 +656,15 @@ def astype_nansafe(arr, dtype, copy=True): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: return arr.view(dtype) - elif dtype != _NS_DTYPE: - raise TypeError("cannot astype a datetimelike from [{from_dtype}] " - "to [{to_dtype}]".format(from_dtype=arr.dtype, - to_dtype=dtype)) - return arr.astype(_NS_DTYPE) + + # allow frequency conversions + if dtype.kind == 'M': + return arr.astype(dtype) + + raise TypeError("cannot astype a datetimelike from [{from_dtype}] " + "to [{to_dtype}]".format(from_dtype=arr.dtype, + to_dtype=dtype)) + elif is_timedelta64_dtype(arr): if dtype == np.int64: return arr.view(dtype) @@ -668,21 +672,23 @@ 
def astype_nansafe(arr, dtype, copy=True): return tslib.ints_to_pytimedelta(arr.view(np.int64)) # in py3, timedelta64[ns] are int64 - elif ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not PY3 and dtype != _TD_DTYPE)): + if ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or + (not PY3 and dtype != _TD_DTYPE)): # allow frequency conversions + # we return a float here! if dtype.kind == 'm': mask = isna(arr) result = arr.astype(dtype).astype(np.float64) result[mask] = np.nan return result + elif dtype == _TD_DTYPE: + return arr.astype(_TD_DTYPE, copy=copy) - raise TypeError("cannot astype a timedelta from [{from_dtype}] " - "to [{to_dtype}]".format(from_dtype=arr.dtype, - to_dtype=dtype)) + raise TypeError("cannot astype a timedelta from [{from_dtype}] " + "to [{to_dtype}]".format(from_dtype=arr.dtype, + to_dtype=dtype)) - return arr.astype(_TD_DTYPE) elif (np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer)): @@ -704,19 +710,7 @@ def astype_nansafe(arr, dtype, copy=True): if copy: - if arr.dtype == dtype: - return arr.copy() - - # we handle datetimelikes with pandas machinery - # to be robust to the input type - elif is_datetime64_dtype(dtype): - from pandas import to_datetime - return to_datetime(arr).values - elif is_timedelta64_dtype(dtype): - from pandas import to_timedelta - return to_timedelta(arr).values - - return arr.astype(dtype) + return arr.astype(dtype, copy=True) return arr.view(dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5d6fc7487eeb5..dca9a5fde0d74 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -4,6 +4,7 @@ from pandas.compat import (string_types, text_type, binary_type, PY3, PY36) from pandas._libs import algos, lib +from pandas._libs.tslibs import conversion from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, @@ -21,8 +22,8 @@ for t in ['O', 'int8', 'uint8', 
'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']]) -_NS_DTYPE = np.dtype('M8[ns]') -_TD_DTYPE = np.dtype('m8[ns]') +_NS_DTYPE = conversion.NS_DTYPE +_TD_DTYPE = conversion.TD_DTYPE _INT64_DTYPE = np.dtype(np.int64) # oh the troubles to reduce import time @@ -31,6 +32,9 @@ _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 +_ensure_datetime64ns = conversion.ensure_datetime64ns +_ensure_timedelta64ns = conversion.ensure_timedelta64ns + def _ensure_float(arr): """ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5a4778ae4e629..3c923133477df 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -631,7 +631,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = astype_nansafe(values.ravel(), dtype, copy=True) values = values.reshape(self.shape) - newb = make_block(values, placement=self.mgr_locs, dtype=dtype, + newb = make_block(values, placement=self.mgr_locs, klass=klass) except: if errors == 'raise': @@ -1954,6 +1954,13 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): _can_hold_na = True is_numeric = False + def __init__(self, values, placement, fastpath=False, **kwargs): + if values.dtype != _TD_DTYPE: + values = conversion.ensure_timedelta64ns(values) + + super(TimeDeltaBlock, self).__init__(values, fastpath=True, + placement=placement, **kwargs) + @property def _box_func(self): return lambda x: tslib.Timedelta(x, unit='ns') diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 21c028e634bc0..70eee5984b438 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -640,6 +640,71 @@ def test_astype_categoricaldtype_class_raises(self, cls): with tm.assert_raises_regex(TypeError, xpr): df['A'].astype(cls) + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + 
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units from numeric origination + # gh-19223 / gh-12425 + dtype = "{}[{}]".format(dtype, unit) + arr = np.array([[1, 2, 3]], dtype=arr_dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + def test_astype_to_datetime_unit(self, unit): + # tests all units from datetime origination + # gh-19223 + dtype = "M8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ['ns']) + def test_astype_to_timedelta_unit_ns(self, unit): + # preserve the timedelta conversion + # gh-19223 + dtype = "m8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D']) + def test_astype_to_timedelta_unit(self, unit): + # coerce to float + # gh-19223 + dtype = "m8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(df.values.astype(dtype).astype(float)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + def test_astype_to_incorrect_datetimelike(self, unit): + # trying to astype a m to a M, or vice-versa + # gh-19224 + dtype = "M8[{}]".format(unit) + other = "m8[{}]".format(unit) + + df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) + with pytest.raises(TypeError): + df.astype(other) + + df = DataFrame(np.array([[1, 2, 3]], dtype=other)) + with pytest.raises(TypeError): + df.astype(dtype) + def test_timedeltas(self): 
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b9a667499b7a0..a8319339c6435 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -523,25 +523,23 @@ def test_other_datetime_unit(self): columns=['entity_id', 'days']) tm.assert_frame_equal(result, exp) - def test_other_timedelta_unit(self): + @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + def test_other_timedelta_unit(self, unit): # GH 13389 df1 = pd.DataFrame({'entity_id': [101, 102]}) s = pd.Series([None, None], index=[101, 102], name='days') - for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', - 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', - 'timedelta64[ns]']: + dtype = "m8[{}]".format(unit) + df2 = s.astype(dtype).to_frame('days') + assert df2['days'].dtype == 'm8[ns]' - df2 = s.astype(dtype).to_frame('days') - assert df2['days'].dtype == dtype - - result = df1.merge(df2, left_on='entity_id', right_index=True) + result = df1.merge(df2, left_on='entity_id', right_index=True) - exp = pd.DataFrame({'entity_id': [101, 102], - 'days': np.array(['nat', 'nat'], - dtype=dtype)}, - columns=['entity_id', 'days']) - tm.assert_frame_equal(result, exp) + exp = pd.DataFrame({'entity_id': [101, 102], + 'days': np.array(['nat', 'nat'], + dtype=dtype)}, + columns=['entity_id', 'days']) + tm.assert_frame_equal(result, exp) def test_overlapping_columns_error_message(self): df = DataFrame({'key': [1, 2, 3], diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a3e40f65e922f..33737387edffa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -552,10 +552,6 @@ def test_constructor_dtype_datetime64(self): s.iloc[0] = np.nan assert s.dtype == 'M8[ns]' - # invalid astypes - for t in ['s', 'D', 'us', 'ms']: 
- pytest.raises(TypeError, s.astype, 'M8[%s]' % t) - # GH3414 related pytest.raises(TypeError, lambda x: Series( Series(dates).astype('int') / 1000000, dtype='M8[ms]')) @@ -707,6 +703,20 @@ def test_constructor_with_datetime_tz(self): expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units + # gh-19223 + dtype = "{}[{}]".format(dtype, unit) + arr = np.array([1, 2, 3], dtype=arr_dtype) + s = Series(arr) + result = s.astype(dtype) + expected = Series(arr.astype(dtype)) + + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('arg', ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 783fcddac1280..ed9307d50521f 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1649,32 +1649,26 @@ def test_invalid_ops(self): pytest.raises(Exception, self.objSeries.__sub__, np.array(1, dtype=np.int64)) - def test_timedelta64_conversions(self): + @pytest.mark.parametrize("m", [1, 3, 10]) + @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + def test_timedelta64_conversions(self, m, unit): + startdate = Series(date_range('2013-01-01', '2013-01-03')) enddate = Series(date_range('2013-03-01', '2013-03-03')) s1 = enddate - startdate s1[2] = np.nan - for m in [1, 3, 10]: - for unit in ['D', 'h', 'm', 's', 'ms', 'us', 'ns']: - - # op - expected = s1.apply(lambda x: x / np.timedelta64(m, unit)) - result = s1 / np.timedelta64(m, unit) - assert_series_equal(result, expected) - - if m == 1 and unit != 'ns': - - # astype - 
result = s1.astype("timedelta64[{0}]".format(unit)) - assert_series_equal(result, expected) + # op + expected = s1.apply(lambda x: x / np.timedelta64(m, unit)) + result = s1 / np.timedelta64(m, unit) + assert_series_equal(result, expected) - # reverse op - expected = s1.apply( - lambda x: Timedelta(np.timedelta64(m, unit)) / x) - result = np.timedelta64(m, unit) / s1 - assert_series_equal(result, expected) + # reverse op + expected = s1.apply( + lambda x: Timedelta(np.timedelta64(m, unit)) / x) + result = np.timedelta64(m, unit) / s1 + assert_series_equal(result, expected) # astype s = Series(date_range('20130101', periods=3))