From a60d410dca4ec6acca4bca1ddea335bf3729ea27 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 14 Aug 2016 10:57:29 +0900 Subject: [PATCH] CLN/BUG: fix ndarray assignment may cause unexpected cast --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/frame.py | 22 ++-- pandas/core/internals.py | 176 +++++++++++++------------ pandas/core/panel.py | 13 +- pandas/tests/indexing/test_coercion.py | 81 ++++++++++-- pandas/tests/indexing/test_indexing.py | 23 +++- pandas/tests/series/test_analytics.py | 8 +- pandas/tests/series/test_missing.py | 13 +- pandas/tests/types/test_cast.py | 54 +++++++- pandas/types/cast.py | 62 +++++++-- 10 files changed, 310 insertions(+), 144 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f3a6736ff9920c..12228dd25d43f6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1562,6 +1562,8 @@ Bug Fixes - Bug in ``DatetimeIndex``, ``TimedeltaIndex`` and ``PeriodIndex.equals()`` may return ``True`` when input isn't ``Index`` but contains the same values (:issue:`13107`) - Bug in assignment against datetime with timezone may not work if it contains datetime near DST boundary (:issue:`14146`) +- Bug in assignment against datetime-like data with ``int`` may be incorrectly converted to datetime-like (:issue:`14145`) +- Bug in assignment against ``int64`` data with a ``float64``-dtype ``np.ndarray`` may incorrectly keep ``int64`` dtype (:issue:`14001`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment.
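A rough sketch of the two behaviors this patch fixes, distilled from the tests changed below (hypothetical interactive session; the variable names and frames are illustrative, not from the patch):

    import numpy as np
    import pandas as pd

    # GH 14145: setting an int into tz-aware datetime data used to coerce the
    # int to a Timestamp; with this patch the Series is upcast to object and
    # keeps the raw int (see test_setitem_series_datetime64tz below).
    s = pd.Series(pd.date_range('2011-01-01', periods=3, tz='US/Eastern'))
    s[1] = 1  # expected dtype after this patch: object

    # GH 14001: assigning a float64 ndarray into int64 data used to keep the
    # int64 dtype, silently truncating; with this patch the result is float64
    # (see the test_multi_assign change below).
    df = pd.DataFrame({'col1': [1, 2, 3]})
    df.loc[[0, 1], 'col1'] = np.array([0.5, 1.5])  # expected dtype: float64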
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ac3e5d2aabef77..bf105f8d082122 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -24,7 +24,7 @@ import numpy.ma as ma from pandas.types.cast import (_maybe_upcast, - _infer_dtype_from_scalar, + _cast_scalar_to_array, _possibly_cast_to_datetime, _possibly_infer_to_datetimelike, _possibly_convert_platform, @@ -332,15 +332,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: - if isinstance(data, compat.string_types) and dtype is None: - dtype = np.object_ - if dtype is None: - dtype, data = _infer_dtype_from_scalar(data) - - values = np.empty((len(index), len(columns)), dtype=dtype) - values.fill(data) - mgr = self._init_ndarray(values, index, columns, dtype=dtype, - copy=False) + values = _cast_scalar_to_array((len(index), len(columns)), + data, dtype=dtype) + mgr = self._init_ndarray(values, index, columns, + dtype=values.dtype, copy=False) else: raise PandasError('DataFrame constructor not properly called!') @@ -454,7 +449,7 @@ def _get_axes(N, K, index=index, columns=columns): values = _prep_ndarray(values, copy=copy) if dtype is not None: - if values.dtype != dtype: + if not is_dtype_equal(values.dtype, dtype): try: values = values.astype(dtype) except Exception as orig: @@ -2653,9 +2648,8 @@ def reindexer(value): else: # upcast the scalar - dtype, value = _infer_dtype_from_scalar(value) - value = np.repeat(value, len(self.index)).astype(dtype) - value = _possibly_cast_to_datetime(value, dtype) + value = _cast_scalar_to_array(len(self.index), value) + value = _possibly_cast_to_datetime(value, value.dtype) # return internal types directly if is_extension_type(value): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac294..d2d1d77d895841 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -42,7 +42,7 @@ is_null_datelike_scalar) import pandas.types.concat as _concat -from pandas.types.generic import ABCSeries +from pandas.types.generic import ABCSeries, ABCDatetimeIndex from pandas.core.common import is_null_slice import pandas.core.algorithms as algos @@ -378,7 +378,8 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, # fillna, but if we cannot coerce, then try again as an ObjectBlock try: - values, _, value, _ = self._try_coerce_args(self.values, value) + values, _, _, _ = self._try_coerce_args(self.values, value) + # value may be converted to internal, thus drop blocks = self.putmask(mask, value, inplace=inplace) blocks = [b.make_block(values=self._try_coerce_result(b.values)) for b in blocks] @@ -666,8 +667,43 @@ def setitem(self, indexer, value, mgr=None): if self.is_numeric: value = np.nan - # coerce args - values, _, value, _ = self._try_coerce_args(self.values, value) + # coerce if block dtype can store value + values = self.values + try: + values, _, value, _ = self._try_coerce_args(values, value) + # can keep its own dtype + if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, + value.dtype): + dtype = self.dtype + else: + dtype = 'infer' + + except (TypeError, ValueError): + # current dtype cannot store value, coerce to common dtype + find_dtype = False + + if hasattr(value, 'dtype'): + dtype = value.dtype + find_dtype = True + + elif is_scalar(value): + if isnull(value): + # NaN promotion is handled in latter path + dtype = False + else: + dtype, _ = _infer_dtype_from_scalar(value, + pandas_dtype=True) + find_dtype = 
True + else: + dtype = 'infer' + + if find_dtype: + dtype = _find_common_type([values.dtype, dtype]) + if not is_dtype_equal(self.dtype, dtype): + b = self.astype(dtype) + return b.setitem(indexer, value, mgr=mgr) + + # value must be storeable at this moment arr_value = np.array(value) # cast the values to a type that can hold nan (if necessary) @@ -697,87 +733,52 @@ def setitem(self, indexer, value, mgr=None): raise ValueError("cannot set using a slice indexer with a " "different length than the value") - try: - - def _is_scalar_indexer(indexer): - # return True if we are all scalar indexers - - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return all([is_scalar(idx) for idx in indexer]) - return False - - def _is_empty_indexer(indexer): - # return a boolean if we have an empty indexer + def _is_scalar_indexer(indexer): + # return True if we are all scalar indexers - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) - return False - - # empty indexers - # 8669 (empty) - if _is_empty_indexer(indexer): - pass - - # setting a single element for each dim and with a rhs that could - # be say a list - # GH 6043 - elif _is_scalar_indexer(indexer): - values[indexer] = value - - # if we are an exact match (ex-broadcasting), - # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): - values[indexer] = value - values = values.astype(arr_value.dtype) - - # set - else: - values[indexer] = value - - # coerce and try to infer the dtypes of the result - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): - dtype = value.dtype - elif is_scalar(value): - dtype, _ = _infer_dtype_from_scalar(value) - else: - dtype = 'infer' - values = self._try_coerce_and_cast_result(values, dtype) - block = self.make_block(transf(values), fastpath=True) - - # may have to soft convert_objects here - if block.is_object and not self.is_object: - block = block.convert(numeric=False) - - return block - except ValueError: - raise - except TypeError: + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return all([is_scalar(idx) for idx in indexer]) + return False - # cast to the passed dtype if possible - # otherwise raise the original error - try: - # e.g. 
we are uint32 and our value is uint64 - # this is for compat with older numpies - block = self.make_block(transf(values.astype(value.dtype))) - return block.setitem(indexer=indexer, value=value, mgr=mgr) + def _is_empty_indexer(indexer): + # return a boolean if we have an empty indexer - except: - pass - - raise + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 + for idx in indexer) + return False - except Exception: + # empty indexers + # 8669 (empty) + if _is_empty_indexer(indexer): pass - return [self] + # setting a single element for each dim and with a rhs that could + # be say a list + # GH 6043 + elif _is_scalar_indexer(indexer): + values[indexer] = value + + # if we are an exact match (ex-broadcasting), + # then use the resultant dtype + elif (len(arr_value.shape) and + arr_value.shape[0] == values.shape[0] and + np.prod(arr_value.shape) == np.prod(values.shape)): + values[indexer] = value + values = values.astype(arr_value.dtype) + + # set + else: + values[indexer] = value + + # coerce and try to infer the dtypes of the result + values = self._try_coerce_and_cast_result(values, dtype) + block = self.make_block(transf(values), fastpath=True) + return block def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False, mgr=None): @@ -1241,6 +1242,7 @@ def func(cond, values, other): values, values_mask, other, other_mask = self._try_coerce_args( values, other) + try: return self._try_coerce_result(expressions.where( cond, values, other, raise_on_error=True)) @@ -1497,6 +1499,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = new[mask] mask = _safe_reshape(mask, new_values.shape) + new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1666,7 +1669,7 @@ def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64) and is_integer(value): + if not isinstance(value, np.timedelta64): value = Timedelta(value, unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1898,6 +1901,15 @@ def _maybe_downcast(self, blocks, downcast=None): def _can_hold_element(self, element): return True + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ + + if isinstance(other, ABCDatetimeIndex): + # to store DatetimeTZBlock as object + other = other.asobject.values + + return values, False, other, False + def _try_cast(self, element): return element @@ -2234,8 +2246,6 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_integer_dtype(other): - other = other.view('i8') else: try: other = np.asarray(other) @@ -2411,6 +2421,8 @@ def _try_coerce_args(self, values, other): raise ValueError("incompatible or non tz-aware value") other_mask = isnull(other) other = other.value + else: + raise TypeError return values, values_mask, other, other_mask diff --git a/pandas/core/panel.py b/pandas/core/panel.py index f708774dd84ff1..a957e8bf8f39eb 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -9,6 +9,7 @@ import numpy as np from pandas.types.cast import (_infer_dtype_from_scalar, + _cast_scalar_to_array, _possibly_cast_item) from pandas.types.common import (is_integer, is_list_like, is_string_like, is_scalar) @@ -166,11 +167,9 @@ def _init_data(self, data, copy, dtype, 
**kwargs): copy = False dtype = None elif is_scalar(data) and all(x is not None for x in passed_axes): - if dtype is None: - dtype, data = _infer_dtype_from_scalar(data) - values = np.empty([len(x) for x in passed_axes], dtype=dtype) - values.fill(data) - mgr = self._init_matrix(values, passed_axes, dtype=dtype, + values = _cast_scalar_to_array([len(x) for x in passed_axes], + data, dtype=dtype) + mgr = self._init_matrix(values, passed_axes, dtype=values.dtype, copy=False) copy = False else: # pragma: no cover @@ -570,9 +569,7 @@ def __setitem__(self, key, value): shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) elif is_scalar(value): - dtype, value = _infer_dtype_from_scalar(value) - mat = np.empty(shape[1:], dtype=dtype) - mat.fill(value) + mat = _cast_scalar_to_array(shape[1:], value) else: raise TypeError('Cannot set item of type: %s' % str(type(value))) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0cfa7258461f11..6d0034a1a3a9fb 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -107,6 +107,19 @@ def test_setitem_series_int64(self): exp = pd.Series([1, 1, 3, 4]) self._assert_setitem_series_conversion(obj, True, exp, np.int64) + def test_setitem_series_int8(self): + # integer dtype coercion (no change) + obj = pd.Series([1, 2, 3, 4], dtype=np.int8) + self.assertEqual(obj.dtype, np.int8) + + exp = pd.Series([1, 1, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, np.int32(1), exp, np.int8) + + # BUG: it must be Series([1, 512, 3, 4], dtype=np.int16) + exp = pd.Series([1, 0, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, np.int16(2**9), exp, + np.int8) + def test_setitem_series_float64(self): obj = pd.Series([1.1, 2.2, 3.3, 4.4]) self.assertEqual(obj.dtype, np.float64) @@ -207,6 +220,13 @@ def test_setitem_series_datetime64(self): pd.Timestamp('2011-01-04')]) self._assert_setitem_series_conversion(obj, 1, exp, 'datetime64[ns]') + # datetime64 + object -> object + exp = pd.Series([pd.Timestamp('2011-01-01'), + 'x', + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_setitem_series_conversion(obj, 'x', exp, np.object) + # ToDo: add more tests once the above issue has been fixed def test_setitem_series_datetime64tz(self): @@ -226,19 +246,62 @@ def test_setitem_series_datetime64tz(self): self._assert_setitem_series_conversion(obj, value, exp, 'datetime64[ns, US/Eastern]') + # datetime64tz + datetime64tz (different tz) -> object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz='US/Pacific'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz='US/Pacific') + self._assert_setitem_series_conversion(obj, value, exp, np.object) + + # datetime64tz + datetime64 -> object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01') + self._assert_setitem_series_conversion(obj, value, exp, np.object) + # datetime64 + int -> object - # ToDo: The result must be object exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp(1, tz=tz), + 1, pd.Timestamp('2011-01-03', tz=tz), pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_setitem_series_conversion(obj, 1, exp, - 'datetime64[ns, US/Eastern]') + self._assert_setitem_series_conversion(obj, 1, exp, np.object) # ToDo: add more tests once 
the above issue has been fixed def test_setitem_series_timedelta64(self): - pass + obj = pd.Series([pd.Timedelta('1 day'), + pd.Timedelta('2 day'), + pd.Timedelta('3 day'), + pd.Timedelta('4 day')]) + self.assertEqual(obj.dtype, 'timedelta64[ns]') + + # timedelta64 + timedelta64 -> timedelta64 + exp = pd.Series([pd.Timedelta('1 day'), + pd.Timedelta('12 day'), + pd.Timedelta('3 day'), + pd.Timedelta('4 day')]) + self._assert_setitem_series_conversion(obj, pd.Timedelta('12 day'), + exp, 'timedelta64[ns]') + + # timedelta64 + int -> timedelta64 + # ToDo: The result must be object + exp = pd.Series([pd.Timedelta('1 day'), + pd.Timedelta(1), + pd.Timedelta('3 day'), + pd.Timedelta('4 day')]) + self._assert_setitem_series_conversion(obj, 1, exp, 'timedelta64[ns]') + + # timedelta64 + object -> object + exp = pd.Series([pd.Timedelta('1 day'), + 'x', + pd.Timedelta('3 day'), + pd.Timedelta('4 day')]) + self._assert_setitem_series_conversion(obj, 'x', exp, np.object) + + # ToDo: add more tests once the above issue has been fixed def test_setitem_series_period(self): pass @@ -1035,14 +1098,12 @@ def test_fillna_series_datetime64tz(self): value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') self._assert_fillna_conversion(obj, value, exp, np.object) - # datetime64tz + int => datetime64tz - # ToDo: must be object + # datetime64tz + int => object exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp(1, tz=tz), + 1, pd.Timestamp('2011-01-03', tz=tz), pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_fillna_conversion(obj, 1, exp, - 'datetime64[ns, US/Eastern]') + self._assert_fillna_conversion(obj, 1, exp, np.object) # datetime64tz + object => object exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e0d63d5aa0c443..f5ecce2848f521 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -969,18 +969,25 @@ def test_indexing_with_datetime_tz(self): tm.assert_frame_equal(result, expected) # indexing - setting an element - df = DataFrame(data=pd.to_datetime( - ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) + df = DataFrame(data=pd.to_datetime(['2015-03-30 20:12:32', + '2015-03-12 00:11:11']), + columns=['time']) df['new_col'] = ['new', 'old'] df.time = df.set_index('time').index.tz_localize('UTC') v = df[df.new_col == 'new'].set_index('time').index.tz_convert( 'US/Pacific') # trying to set a single element on a part of a different timezone - def f(): - df.loc[df.new_col == 'new', 'time'] = v + df2 = df.copy() + df2.loc[df2.new_col == 'new', 'time'] = v - self.assertRaises(ValueError, f) + exp = pd.DataFrame({'time': [pd.Timestamp('2015-03-30 13:12:32', + tz='US/Pacific'), + pd.Timestamp('2015-03-12 00:11:11', + tz='UTC')], + 'new_col': ['new', 'old']}, + columns=['time', 'new_col']) + tm.assert_frame_equal(df2, exp) v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') df.loc[df.new_col == 'new', 'time'] = v @@ -3311,6 +3318,12 @@ def test_multi_assign(self): tm.assert_frame_equal(df2, expected) # with an ndarray on rhs + # coerces to float64 because the rhs ndarray has float64 dtype + # GH 14001 + expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], + 'PF': [0, 0, 0, 0, 1, 1], + 'col1': [0., 1., 4., 6., 8., 10.], + 'col2': [12, 7, 16, np.nan, 20, 22]}) df2 = df.copy() df2.ix[mask, cols] = dft.ix[mask, cols].values tm.assert_frame_equal(df2, expected) diff --git a/pandas/tests/series/test_analytics.py 
b/pandas/tests/series/test_analytics.py index 24e3a0ff5f3259..9052e1e4db16cf 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1072,11 +1072,11 @@ def test_clip_with_datetimes(self): # naive and tz-aware datetimes t = Timestamp('2015-12-01 09:30:30') - s = Series([Timestamp('2015-12-01 09:30:00'), Timestamp( - '2015-12-01 09:31:00')]) + s = Series([Timestamp('2015-12-01 09:30:00'), + Timestamp('2015-12-01 09:31:00')]) result = s.clip(upper=t) - expected = Series([Timestamp('2015-12-01 09:30:00'), Timestamp( - '2015-12-01 09:30:30')]) + expected = Series([Timestamp('2015-12-01 09:30:00'), + Timestamp('2015-12-01 09:30:30')]) assert_series_equal(result, expected) t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern') diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 4e6c58df54dfd8..d31b9ed275cb6c 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -40,14 +40,14 @@ class TestSeriesMissingData(TestData, tm.TestCase): def test_timedelta_fillna(self): # GH 3371 - s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( - '20130102'), Timestamp('20130103 9:01:01')]) + s = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130102'), Timestamp('20130103 9:01:01')]) td = s.diff() # reg fillna result = td.fillna(0) - expected = Series([timedelta(0), timedelta(0), timedelta(1), timedelta( - days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series([timedelta(0), timedelta(0), timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) assert_series_equal(result, expected) # interprested as seconds @@ -57,8 +57,9 @@ def test_timedelta_fillna(self): assert_series_equal(result, expected) result = td.fillna(timedelta(days=1, seconds=1)) - expected = Series([timedelta(days=1, seconds=1), timedelta( - 0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series([timedelta(days=1, seconds=1), timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) assert_series_equal(result, expected) result = td.fillna(np.timedelta64(int(1e9))) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 56a14a51105ca9..e2b7e9e2cb3d72 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -10,12 +10,13 @@ from datetime import datetime import numpy as np -from pandas import Timedelta, Timestamp +from pandas import Timedelta, Timestamp, Period from pandas.types.cast import (_possibly_downcast_to_dtype, _possibly_convert_objects, _infer_dtype_from_scalar, _maybe_convert_string_to_object, _maybe_convert_scalar, + _cast_scalar_to_array, _find_common_type) from pandas.types.dtypes import (CategoricalDtype, DatetimeTZDtype, PeriodDtype) @@ -120,11 +121,58 @@ def test_infer_dtype_from_scalar(self): dtype, val = _infer_dtype_from_scalar(data) self.assertEqual(dtype, 'm8[ns]') - for data in [datetime.date(2000, 1, 1), - Timestamp(1, tz='US/Eastern'), 'foo']: + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dt = Timestamp(1, tz=tz) + dtype, val = _infer_dtype_from_scalar(dt, pandas_dtype=True) + self.assertEqual(dtype, 'datetime64[ns, {0}]'.format(tz)) + self.assertEqual(val, dt.value) + + dtype, val = _infer_dtype_from_scalar(dt) + self.assertEqual(dtype, np.object_) + self.assertEqual(val, dt) + + for freq in ['M', 'D']: + p = Period('2011-01-01', freq=freq) + dtype, val = _infer_dtype_from_scalar(p, pandas_dtype=True) + self.assertEqual(dtype, 
'period[{0}]'.format(freq)) + self.assertEqual(val, p.ordinal) + + dtype, val = _infer_dtype_from_scalar(p) + self.assertEqual(dtype, np.object_) + self.assertEqual(val, p) + + for data in [datetime.date(2000, 1, 1), 'foo']: dtype, val = _infer_dtype_from_scalar(data) self.assertEqual(dtype, np.object_) + def test_cast_scalar_to_array(self): + arr = _cast_scalar_to_array((3, 2), 1, dtype=np.int64) + exp = np.ones((3, 2), dtype=np.int64) + tm.assert_numpy_array_equal(arr, exp) + + arr = _cast_scalar_to_array((3, 2), 1.1) + exp = np.empty((3, 2), dtype=np.float64) + exp.fill(1.1) + tm.assert_numpy_array_equal(arr, exp) + + arr = _cast_scalar_to_array((2, 3), Timestamp('2011-01-01')) + exp = np.empty((2, 3), dtype='datetime64[ns]') + exp.fill(np.datetime64('2011-01-01')) + tm.assert_numpy_array_equal(arr, exp) + + # pandas dtype is stored as object dtype + obj = Timestamp('2011-01-01', tz='US/Eastern') + arr = _cast_scalar_to_array((2, 3), obj) + exp = np.empty((2, 3), dtype=np.object) + exp.fill(obj) + tm.assert_numpy_array_equal(arr, exp) + + obj = Period('2011-01-01', freq='D') + arr = _cast_scalar_to_array((2, 3), obj) + exp = np.empty((2, 3), dtype=np.object) + exp.fill(obj) + tm.assert_numpy_array_equal(arr, exp) + class TestMaybe(tm.TestCase): diff --git a/pandas/types/cast.py b/pandas/types/cast.py index a79862eb195b6c..61d851996e9efa 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -3,7 +3,7 @@ from datetime import datetime, timedelta import numpy as np from pandas import lib, tslib -from pandas.tslib import iNaT +from pandas.tslib import iNaT, NaT, Timestamp from pandas.compat import string_types, text_type, PY3 from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, @@ -19,7 +19,7 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) -from .dtypes import ExtensionDtype +from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull from .inference import is_list_like @@ -248,7 +248,7 @@ def _maybe_promote(dtype, fill_value=np.nan): else: if issubclass(dtype.type, np.datetime64): try: - fill_value = lib.Timestamp(fill_value).value + fill_value = Timestamp(fill_value).value except: # the proper thing to do here would probably be to upcast # to object (but numpy 1.6.1 doesn't do this properly) @@ -309,16 +309,24 @@ def _maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ +def _infer_dtype_from_scalar(val, pandas_dtype=False): + """ + interpret the dtype from a scalar + + Parameters + ---------- + pandas_dtype : bool, default False + whether to infer pandas extension dtypes (e.g. tz-aware datetime, + Period); if False, such scalars are inferred as ``object`` + """ dtype = np.object_ # a 1-element ndarray if isinstance(val, np.ndarray): + msg = "invalid ndarray passed to _infer_dtype_from_scalar" if val.ndim != 0: - raise ValueError( - "invalid ndarray passed to _infer_dtype_from_scalar") + raise ValueError(msg) dtype = val.dtype val = val.item() @@ -333,10 +341,18 @@ def _infer_dtype_from_scalar(val): dtype = np.object_ - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') + elif isinstance(val, (np.datetime64, datetime)): + val = Timestamp(val) + if val is NaT or val.tz is None: + dtype = np.dtype('M8[ns]') + 
else: + + if pandas_dtype: + dtype = DatetimeTZDtype(unit='ns', tz=val.tz) + else: + # return datetimetz as object + return np.object_, val + val = val.value elif isinstance(val, (np.timedelta64, timedelta)): val = lib.Timedelta(val).value @@ -360,6 +376,12 @@ elif is_complex(val): dtype = np.complex_ + elif pandas_dtype: + from pandas.tseries.period import Period + if isinstance(val, Period): + dtype = PeriodDtype(freq=val.freq) + val = val.ordinal + return dtype, val @@ -463,7 +485,7 @@ def conv(r, dtype): if isnull(r): pass elif dtype == _NS_DTYPE: - r = lib.Timestamp(r) + r = Timestamp(r) elif dtype == _TD_DTYPE: r = _coerce_scalar_to_timedelta_type(r) elif dtype == np.bool_: @@ -886,3 +908,19 @@ def _find_common_type(types): return np.dtype('timedelta64[ns]') return np.find_common_type(types, []) + + +def _cast_scalar_to_array(shape, value, dtype=None): + """ + create np.ndarray of specified shape and dtype, filled with the scalar value + """ + + if dtype is None: + dtype, fill_value = _infer_dtype_from_scalar(value) + else: + fill_value = value + + values = np.empty(shape, dtype=dtype) + values.fill(fill_value) + + return values
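For reference, a brief usage sketch of the private helpers this patch adds and extends, mirroring the tests above (internal pandas API, shown only to illustrate the intended post-patch behavior):

    import numpy as np
    import pandas as pd
    from pandas.types.cast import (_cast_scalar_to_array,
                                   _infer_dtype_from_scalar)

    # dtype is inferred from the scalar when none is passed
    arr = _cast_scalar_to_array((3, 2), 1.1)
    # arr.dtype == np.float64, every element is 1.1

    # pandas extension scalars (tz-aware Timestamp, Period) are stored
    # in an object ndarray
    ts = pd.Timestamp('2011-01-01', tz='US/Eastern')
    arr = _cast_scalar_to_array((2, 3), ts)
    # arr.dtype == np.object_

    # pandas_dtype=True surfaces the extension dtype instead of object
    dtype, val = _infer_dtype_from_scalar(ts, pandas_dtype=True)
    # dtype == 'datetime64[ns, US/Eastern]', val == ts.value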