From 405f9f1f9b3ec2e1e1adfd8652e65ec16b316d64 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 18 May 2017 21:42:26 -0400 Subject: [PATCH] rebase & cleanup, fixup some edge cases closes #16402 --- doc/source/whatsnew/v0.21.0.txt | 3 +- pandas/_libs/index.pyx | 19 +- pandas/core/dtypes/cast.py | 18 +- pandas/core/frame.py | 11 +- pandas/core/generic.py | 43 --- pandas/core/internals.py | 375 +++++++++++++++---------- pandas/core/panel.py | 7 +- pandas/tests/dtypes/test_cast.py | 154 ++++------ pandas/tests/frame/test_operators.py | 1 + pandas/tests/indexing/test_coercion.py | 36 ++- pandas/tests/indexing/test_datetime.py | 10 +- pandas/tests/indexing/test_indexing.py | 3 +- pandas/tests/series/test_indexing.py | 43 ++- 13 files changed, 383 insertions(+), 340 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index e337fb7b2377b..e66399cfdd63d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -84,7 +84,8 @@ Bug Fixes Conversion ^^^^^^^^^^ -- Bug in assignment against datetime-like data with ``int`` may incorrectly converted to datetime-like (:issue:`14145`) +- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`) +- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 21680fb0b3921..60eb2742a7b9a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -19,6 +19,7 @@ cimport tslib from hashtable cimport * from pandas._libs import tslib, algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta +from datetime import datetime, timedelta from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) @@ -507,24 
+508,30 @@ cdef class TimedeltaEngine(DatetimeEngine): return 'm8[ns]' cpdef convert_scalar(ndarray arr, object value): + # we don't turn integers + # into datetimes/timedeltas + if arr.descr.type_num == NPY_DATETIME: if isinstance(value, np.ndarray): pass - elif isinstance(value, Timestamp): - return value.value + elif isinstance(value, datetime): + return Timestamp(value).value elif value is None or value != value: return iNaT - else: + elif util.is_string_object(value): return Timestamp(value).value + raise ValueError("cannot set a Timestamp with a non-timestamp") + elif arr.descr.type_num == NPY_TIMEDELTA: if isinstance(value, np.ndarray): pass - elif isinstance(value, Timedelta): - return value.value + elif isinstance(value, timedelta): + return Timedelta(value).value elif value is None or value != value: return iNaT - else: + elif util.is_string_object(value): return Timedelta(value).value + raise ValueError("cannot set a Timedelta with a non-timedelta") if issubclass(arr.dtype.type, (np.integer, np.bool_)): if util.is_float_object(value) and value != value: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7fec2a78f1b31..ccbcda6ad4d4d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,7 +6,7 @@ import warnings from pandas._libs import tslib, lib -from pandas._libs.tslib import iNaT +from pandas._libs.tslib import iNaT, Timestamp from pandas.compat import string_types, text_type, PY3 from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, @@ -1028,13 +1028,25 @@ def find_common_type(types): return np.find_common_type(types, []) -def _cast_scalar_to_array(shape, value, dtype=None): +def cast_scalar_to_array(shape, value, dtype=None): """ create np.ndarray of specified shape and dtype, filled with values + + Parameters + ---------- + shape : tuple + value : scalar value + dtype : np.dtype, optional + dtype to coerce + + Returns + ------- + ndarray of 
shape, filled with value, of specified / inferred dtype + """ if dtype is None: - dtype, fill_value = _infer_dtype_from_scalar(value) + dtype, fill_value = infer_dtype_from_scalar(value) else: fill_value = value diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 71bf096c6f42a..c18ab41744486 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -25,7 +25,7 @@ import numpy.ma as ma from pandas.core.dtypes.cast import ( - maybe_upcast, infer_dtype_from_scalar, + maybe_upcast, maybe_cast_to_datetime, maybe_infer_to_datetimelike, maybe_convert_platform, @@ -33,6 +33,7 @@ invalidate_string_dtypes, coerce_to_dtypes, maybe_upcast_putmask, + cast_scalar_to_array, find_common_type) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -343,8 +344,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: - values = _cast_scalar_to_array((len(index), len(columns)), - data, dtype=dtype) + values = cast_scalar_to_array((len(index), len(columns)), + data, dtype=dtype) mgr = self._init_ndarray(values, index, columns, dtype=values.dtype, copy=False) else: @@ -2734,8 +2735,8 @@ def reindexer(value): else: # upcast the scalar - value = _cast_scalar_to_array(len(self.index), value) - value = _possibly_cast_to_datetime(value, value.dtype) + value = cast_scalar_to_array(len(self.index), value) + value = maybe_cast_to_datetime(value, value.dtype) # return internal types directly if is_extension_type(value): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2e7d8693d48dd..b24460982b575 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,7 +12,6 @@ from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _ensure_int64, - needs_i8_conversion, is_scalar, is_number, is_integer, is_bool, @@ -5030,48 +5029,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, raise 
NotImplemented("cannot align with a higher dimensional " "NDFrame") - elif is_list_like(other): - - if self.ndim == 1: - - # try to set the same dtype as ourselves - try: - new_other = np.array(other, dtype=self.dtype) - except ValueError: - new_other = np.array(other) - except TypeError: - new_other = other - - # we can end up comparing integers and m8[ns] - # which is a numpy no no - is_i8 = needs_i8_conversion(self.dtype) - if is_i8: - matches = False - else: - matches = (new_other == np.array(other)) - - if matches is False or not matches.all(): - - # coerce other to a common dtype if we can - if needs_i8_conversion(self.dtype): - try: - other = np.array(other, dtype=self.dtype) - except: - other = np.array(other) - else: - other = np.asarray(other) - other = np.asarray(other, - dtype=np.common_type(other, - new_other)) - - # we need to use the new dtype - try_quick = False - else: - other = new_other - else: - - other = np.array(other) - if isinstance(other, np.ndarray): if other.shape != self.shape: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b9913722f86e2..0850d114f9d34 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,8 +14,10 @@ CategoricalDtype) from pandas.core.dtypes.common import ( _TD_DTYPE, _NS_DTYPE, - _ensure_int64, _ensure_platform_int, + _ensure_int64, + _ensure_platform_int, is_integer, + is_number, is_dtype_equal, is_timedelta64_dtype, is_datetime64_dtype, is_datetimetz, is_sparse, @@ -33,10 +35,10 @@ _get_dtype) from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, - maybe_convert_string_to_object, maybe_upcast, - maybe_convert_scalar, maybe_promote, + maybe_promote, infer_dtype_from_scalar, + infer_dtype_from_array, soft_convert_objects, maybe_convert_objects, astype_nansafe, @@ -366,6 +368,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, """ fillna on the block with the value. 
If we fail, then convert to ObjectBlock and try again """ + inplace = validate_bool_kwarg(inplace, 'inplace') if not self._can_hold_na: @@ -388,8 +391,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, # fillna, but if we cannot coerce, then try again as an ObjectBlock try: - values, _, _, _ = self._try_coerce_args(self.values, value) - # value may be converted to internal, thus drop blocks = self.putmask(mask, value, inplace=inplace) blocks = [b.make_block(values=self._try_coerce_result(b.values)) for b in blocks] @@ -658,11 +659,8 @@ def replace(self, to_replace, value, inplace=False, filter=None, return blocks except (TypeError, ValueError): - # we can't process the value, but nothing to do - if not mask.any(): - return self if inplace else self.copy() - - return self.to_object_block(mgr=mgr).replace( + block = self.to_object_block(mgr) + return block.replace( to_replace=original_to_replace, value=value, inplace=inplace, filter=filter, regex=regex, convert=convert) @@ -677,50 +675,21 @@ def setitem(self, indexer, value, mgr=None): indexer is a direct slice/positional indexer; value must be a compatible shape """ - # coerce None values, if appropriate if value is None: if self.is_numeric: value = np.nan - # coerce if block dtype can store value - values = self.values + # coerce args try: - values, _, value, _ = self._try_coerce_args(values, value) - # can keep its own dtype - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): - dtype = self.dtype - else: - dtype = 'infer' - - except (TypeError, ValueError): - # current dtype cannot store value, coerce to common dtype - find_dtype = False - - if hasattr(value, 'dtype'): - dtype = value.dtype - find_dtype = True - - elif is_scalar(value): - if isnull(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = _infer_dtype_from_scalar(value, - pandas_dtype=True) - find_dtype = True - else: - dtype = 'infer' - - if find_dtype: - dtype = 
_find_common_type([values.dtype, dtype]) - if not is_dtype_equal(self.dtype, dtype): - b = self.astype(dtype) - return b.setitem(indexer, value, mgr=mgr) + values, _, value, _ = self._try_coerce_args(self.values, value) + arr_value = np.array(value) + except (ValueError, TypeError): - # value must be storeable at this moment - arr_value = np.array(value) + # coercion has failed to the current type + # upcast to object + block = self.to_object_block(mgr) + return block.setitem(indexer, value, mgr=mgr) # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value): @@ -749,8 +718,19 @@ def setitem(self, indexer, value, mgr=None): raise ValueError("cannot set using a slice indexer with a " "different length than the value") - def _is_scalar_indexer(indexer): - # return True if we are all scalar indexers + try: + + def _is_scalar_indexer(indexer): + # return True if we are all scalar indexers + + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return all([is_scalar(idx) for idx in indexer]) + return False + + def _is_empty_indexer(indexer): + # return a boolean if we have an empty indexer if arr_value.ndim == 1: if not isinstance(indexer, tuple): @@ -802,43 +782,23 @@ def _is_scalar_indexer(indexer): raise except TypeError: - def _is_empty_indexer(indexer): - # return a boolean if we have an empty indexer + # cast to the passed dtype if possible + # otherwise raise the original error + try: + # e.g. 
we are uint32 and our value is uint64 + # this is for compat with older numpies + block = self.make_block(transf(values.astype(value.dtype))) + return block.setitem(indexer=indexer, value=value, mgr=mgr) - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) - return False + except: + pass - # empty indexers - # 8669 (empty) - if _is_empty_indexer(indexer): - pass + raise - # setting a single element for each dim and with a rhs that could - # be say a list - # GH 6043 - elif _is_scalar_indexer(indexer): - values[indexer] = value - - # if we are an exact match (ex-broadcasting), - # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): - values[indexer] = value - values = values.astype(arr_value.dtype) - - # set - else: - values[indexer] = value + except Exception: + pass - # coerce and try to infer the dtypes of the result - values = self._try_coerce_and_cast_result(values, dtype) - block = self.make_block(transf(values), fastpath=True) - return block + return [self] def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False, mgr=None): @@ -888,6 +848,23 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) + # we require exact matches between the len of the + # values we are setting (or is compat). 
np.putmask + # doesn't check this and will simply truncate / pad + # the output, but we want sane error messages + # + # TODO: this probably needs some better checking + # for 2D cases + if ((is_list_like(new) and + np.any(mask[mask]) and + getattr(new, 'ndim', 1) == 1)): + + if not (mask.shape[-1] == len(new) or + mask[mask].shape[-1] == len(new) or + len(new) == 1): + raise ValueError("cannot assign mismatch " + "length to masked array") + np.putmask(new_values, mask, new) # maybe upcast me @@ -905,6 +882,17 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_shape.insert(axis, 1) new = new.reshape(tuple(new_shape)) + # we will raise with an incompatible type here + try: + self._try_coerce_args(new_values, new) + except (ValueError, TypeError): + # coercion has failed to the current type + # upcast to object + block = self.to_object_block(mgr) + return block.putmask(mask=mask, new=new, align=align, + inplace=inplace, axis=axis, + transpose=transpose, mgr=mgr) + # need to go column by column new_blocks = [] if self.ndim > 1: @@ -937,6 +925,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_blocks.append(block) else: + nv = _putmask_smart(new_values, mask, new) new_blocks.append(self.make_block(values=nv, fastpath=True)) @@ -1180,8 +1169,17 @@ def eval(self, func, other, raise_on_error=True, try_cast=False, mgr=None): transf = (lambda x: x.T) if is_transposed else (lambda x: x) # coerce/transpose the args if needed - values, values_mask, other, other_mask = self._try_coerce_args( - transf(values), other) + try: + values, values_mask, other, other_mask = self._try_coerce_args( + transf(values), other) + except (ValueError, TypeError): + + # coercion has failed to the current type + # upcast to object + block = self.to_object_block(mgr) + return block.eval(func=func, other=other, + raise_on_error=raise_on_error, + try_cast=try_cast, mgr=None) # get the result, may need to transpose the other def get_result(other): @@ -1210,19 +1208,6 
@@ def get_result(other): return self._try_coerce_result(result) - # error handler if we have an issue operating with the function - def handle_error(): - - if raise_on_error: - # The 'detail' variable is defined in outer scope. - raise TypeError('Could not operate %s with block values %s' % - (repr(other), str(detail))) # noqa - else: - # return the values - result = np.empty(values.shape, dtype='O') - result.fill(np.nan) - return result - # get the result try: with np.errstate(all='ignore'): @@ -1232,8 +1217,19 @@ def handle_error(): # GH4576, so raise instead of allowing to pass through except ValueError as detail: raise + + # convert these to TypeErrors + except NotImplementedError as detail: + raise TypeError(detail) + except Exception as detail: - result = handle_error() + + if raise_on_error: + raise + + # return the values + result = np.empty(values.shape, dtype='O') + result.fill(np.nan) # technically a broadcast error in numpy can 'work' by returning a # boolean False @@ -1299,29 +1295,35 @@ def where(self, other, cond, align=True, raise_on_error=True, raise ValueError("where must have a condition that is ndarray " "like") - other = maybe_convert_string_to_object(other) - other = maybe_convert_scalar(other) + try: + values, _, other, _ = self._try_coerce_args(values, other) + except (ValueError, TypeError) as detail: + + # coerce to object + block = self.to_object_block(mgr) + return block.where(other, cond, align=align, + raise_on_error=raise_on_error, + try_cast=try_cast, axis=axis, + transpose=transpose, mgr=mgr) # our where function def func(cond, values, other): if cond.ravel().all(): return values - values, values_mask, other, other_mask = self._try_coerce_args( - values, other) - try: - return self._try_coerce_result(expressions.where( - cond, values, other, raise_on_error=True)) + result = expressions.where( + cond, values, other, raise_on_error=True) + return self._try_coerce_result(result) except Exception as detail: + if raise_on_error: - raise 
TypeError('Could not operate [%s] with block values ' - '[%s]' % (repr(other), str(detail))) - else: - # return the values - result = np.empty(values.shape, dtype='float64') - result.fill(np.nan) - return result + raise + + # return the values + result = np.empty(values.shape, dtype='float64') + result.fill(np.nan) + return result # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) @@ -1589,7 +1591,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = new[mask] mask = _safe_reshape(mask, new_values.shape) - new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1720,6 +1721,41 @@ def _try_cast(self, element): def should_store(self, value): return is_integer_dtype(value) and value.dtype == self.dtype + def _try_coerce_args(self, values, other): + """ + For int-dtypes we don't want to operate directly with others that + are strings. numpy will coerce all to strings, so we will raise + and the higher level can handle that (generally by converting + to object) + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + if is_list_like(other): + + dtype, _ = infer_dtype_from_array(other) + if is_object_dtype(dtype): + + # coercion issues + # let higher levels handle + raise TypeError("cannot convert {} to an Int".format( + type(other).__name__)) + + elif is_scalar(other) and not is_number(other): + + # coercion issues + # let higher levels handle + raise TypeError("cannot convert {} to an Int".format( + type(other).__name__)) + + return values, False, other, False + class DatetimeLikeBlockMixin(object): @@ -1759,7 +1795,7 @@ def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64): + if not isinstance(value, np.timedelta64) and 
is_integer(value): value = Timedelta(value, unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1783,7 +1819,7 @@ def _try_coerce_args(self, values, other): other_mask = False if isinstance(other, bool): - raise TypeError + raise TypeError("cannot convert bool to a Timedelta") elif is_null_datelike_scalar(other): other = tslib.iNaT other_mask = True @@ -1795,14 +1831,15 @@ def _try_coerce_args(self, values, other): other = Timedelta(other).value elif isinstance(other, timedelta): other = Timedelta(other).value - elif isinstance(other, np.ndarray): - other_mask = isnull(other) + elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): other = other.astype('i8', copy=False).view('i8') - else: - # scalar - other = Timedelta(other) other_mask = isnull(other) - other = other.value + else: + + # coercion issues + # let higher levels handle + raise TypeError("cannot convert {} to a Timedelta".format( + type(other).__name__)) return values, values_mask, other, other_mask @@ -1996,15 +2033,6 @@ def _maybe_downcast(self, blocks, downcast=None): def _can_hold_element(self, element): return True - def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments """ - - if isinstance(other, ABCDatetimeIndex): - # to store DatetimeTZBlock as object - other = other.asobject.values - - return values, False, other, False - def _try_cast(self, element): return element @@ -2299,11 +2327,33 @@ def _astype(self, dtype, mgr=None, **kwargs): return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) def _can_hold_element(self, element): + """ + boolean if we can hold this element, will raise on a + tz-aware datetime + """ + if is_list_like(element): + + # we cannot hold tz-aware + # higher level to handle + if not getattr(element, 'tz', None) is not None: + return False + element = np.array(element) return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - 
isnull(element)) + + if is_integer(element) or isnull(element): + return True + + if isinstance(element, datetime): + + # we cannot hold tz-aware + if getattr(element, 'tzinfo', None) is not None: + return False + + return True + + return False def _try_cast(self, element): try: @@ -2333,7 +2383,7 @@ def _try_coerce_args(self, values, other): other_mask = False if isinstance(other, bool): - raise TypeError + raise TypeError("cannot convert a bool to a Datetime") elif is_null_datelike_scalar(other): other = tslib.iNaT other_mask = True @@ -2344,26 +2394,34 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - else: - try: - other = np.asarray(other) - other_mask = isnull(other) + elif hasattr(other, 'dtype') and is_datetime64_dtype(other): + if is_datetime64tz_dtype(other): + raise TypeError("cannot coerce a Timestamp with a tz on a " + "naive Block") + other = other.view('i8') + other_mask = isnull(other) - other = other.astype('i8', copy=False).view('i8') - except ValueError: + else: - # coercion issues - # let higher levels handle - raise TypeError + # coercion issues + # let higher levels handle + raise TypeError("cannot convert a {} to a Datetime".format( + type(other).__name__)) return values, values_mask, other, other_mask def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f', 'O']: + if result.dtype.kind in ['i', 'f']: + result = result.astype('M8[ns]') + elif result.dtype.kind in ['O']: try: - result = result.astype('M8[ns]') + + # PITA + # we could have mixed naive & tz-aware + from pandas import to_datetime + result = to_datetime(result, box=False) except ValueError: pass elif isinstance(result, (np.integer, np.float, np.datetime64)): @@ -2501,7 +2559,7 @@ def _try_coerce_args(self, values, other): other_mask = isnull(other) if isinstance(other, bool): - raise TypeError + raise TypeError("cannot 
convert a bool to a tz-aware Datetime") elif (is_null_datelike_scalar(other) or (is_scalar(other) and isnull(other))): other = tslib.iNaT @@ -2520,8 +2578,21 @@ def _try_coerce_args(self, values, other): raise ValueError("incompatible or non tz-aware value") other_mask = isnull(other) other = other.value + elif hasattr(other, 'dtype'): + tz = getattr(other, 'tz', None) + if tz is None or str(tz) != str(self.values.tz): + raise ValueError("incompatible or non tz-aware value") + other_mask = isnull(other) + other = other.view('i8') else: - raise TypeError + + if is_null_datelike_scalar(other): + other_mask = True + else: + # higher level to coerce + raise TypeError( + "cannot convert a {} to a tz-aware Datetime".format( + type(other).__name__)) return values, values_mask, other, other_mask @@ -4851,6 +4922,7 @@ def _putmask_smart(v, m, n): pass # change the dtype + n = np.asarray(n) dtype, _ = maybe_promote(n.dtype) if is_extension_type(v.dtype) and is_object_dtype(dtype): @@ -4861,9 +4933,20 @@ def _putmask_smart(v, m, n): try: nv[m] = n[m] except ValueError: + idx, = np.where(np.squeeze(m)) for mask_index, new_val in zip(idx, n[m]): nv[mask_index] = new_val + + except IndexError: + + # we mismatch in the assignment + # length; instead n matches the masked values + if len(nv[m]) != len(n): + raise ValueError("cannot assign mismatch length to masked array") + + nv[m] = n + return nv diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 5a47258fa782e..3b8be05bfc4a7 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -8,6 +8,7 @@ import warnings from pandas.core.dtypes.cast import ( infer_dtype_from_scalar, + cast_scalar_to_array, maybe_cast_item) from pandas.core.dtypes.common import ( is_integer, is_list_like, @@ -178,8 +179,8 @@ def _init_data(self, data, copy, dtype, **kwargs): copy = False dtype = None elif is_scalar(data) and all(x is not None for x in passed_axes): - values = _cast_scalar_to_array([len(x) for x in passed_axes], - data, 
dtype=dtype) + values = cast_scalar_to_array([len(x) for x in passed_axes], + data, dtype=dtype) mgr = self._init_matrix(values, passed_axes, dtype=values.dtype, copy=False) copy = False @@ -580,7 +581,7 @@ def __setitem__(self, key, value): shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) elif is_scalar(value): - mat = _cast_scalar_to_array(shape[1:], value) + mat = cast_scalar_to_array(shape[1:], value) else: raise TypeError('Cannot set item of type: %s' % str(type(value))) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 3f4cd0c0482ba..0b23b662b2a65 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex +from pandas import Period, Timedelta, Timestamp, DatetimeIndex from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -18,7 +18,8 @@ infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, - find_common_type) + find_common_type, + cast_scalar_to_array) from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -131,8 +132,28 @@ def test_infer_dtype_from_scalar(self): dtype, val = infer_dtype_from_scalar(data) assert dtype == 'm8[ns]' - for data in [date(2000, 1, 1), - Timestamp(1, tz='US/Eastern'), 'foo']: + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dt = Timestamp(1, tz=tz) + dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=True) + assert dtype == 'datetime64[ns, {0}]'.format(tz) + assert val == dt.value + + dtype, val = infer_dtype_from_scalar(dt) + assert dtype == np.object_ + assert val == dt + + for freq in ['M', 'D']: + p = Period('2011-01-01', freq=freq) + dtype, val = infer_dtype_from_scalar(p, pandas_dtype=True) + assert dtype == 'period[{0}]'.format(freq) + assert val == p.ordinal + + dtype, val = infer_dtype_from_scalar(p) + dtype == np.object_ + assert val == p + + # 
misc + for data in [date(2000, 1, 1), 'foo']: dtype, val = infer_dtype_from_scalar(data) assert dtype == np.object_ @@ -155,6 +176,34 @@ def test_infer_dtype_from_array(self, arr, expected): dtype, _ = infer_dtype_from_array(arr) assert dtype == expected + def test_cast_scalar_to_array(self): + arr = cast_scalar_to_array((3, 2), 1, dtype=np.int64) + exp = np.ones((3, 2), dtype=np.int64) + tm.assert_numpy_array_equal(arr, exp) + + arr = cast_scalar_to_array((3, 2), 1.1) + exp = np.empty((3, 2), dtype=np.float64) + exp.fill(1.1) + tm.assert_numpy_array_equal(arr, exp) + + arr = cast_scalar_to_array((2, 3), Timestamp('2011-01-01')) + exp = np.empty((2, 3), dtype='datetime64[ns]') + exp.fill(np.datetime64('2011-01-01')) + tm.assert_numpy_array_equal(arr, exp) + + # pandas dtype is stored as object dtype + obj = Timestamp('2011-01-01', tz='US/Eastern') + arr = cast_scalar_to_array((2, 3), obj) + exp = np.empty((2, 3), dtype=np.object) + exp.fill(obj) + tm.assert_numpy_array_equal(arr, exp) + + obj = Period('2011-01-01', freq='D') + arr = cast_scalar_to_array((2, 3), obj) + exp = np.empty((2, 3), dtype=np.object) + exp.fill(obj) + tm.assert_numpy_array_equal(arr, exp) + class TestMaybe(object): @@ -321,100 +370,3 @@ def test_period_dtype(self): np.dtype('datetime64[ns]'), np.object, np.int64]: assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object - - -class TestInferDtype2(tm.TestCase): - - def test_infer_dtype_from_scalar(self): - # Test that _infer_dtype_from_scalar is returning correct dtype for int - # and float. 
- - for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, - np.int32, np.uint64, np.int64]: - data = dtypec(12) - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) - - data = 12 - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) - - for dtypec in [np.float16, np.float32, np.float64]: - data = dtypec(12) - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) - - data = np.float(12) - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) - - for data in [True, False]: - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) - - for data in [np.complex64(1), np.complex128(1)]: - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) - - import datetime - for data in [np.datetime64(1, 'ns'), Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') - - for data in [np.timedelta64(1, 'ns'), Timedelta(1), - datetime.timedelta(1)]: - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') - - for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: - dt = Timestamp(1, tz=tz) - dtype, val = _infer_dtype_from_scalar(dt, pandas_dtype=True) - self.assertEqual(dtype, 'datetime64[ns, {0}]'.format(tz)) - self.assertEqual(val, dt.value) - - dtype, val = _infer_dtype_from_scalar(dt) - self.assertEqual(dtype, np.object_) - self.assertEqual(val, dt) - - for freq in ['M', 'D']: - p = Period('2011-01-01', freq=freq) - dtype, val = _infer_dtype_from_scalar(p, pandas_dtype=True) - self.assertEqual(dtype, 'period[{0}]'.format(freq)) - self.assertEqual(val, p.ordinal) - - dtype, val = _infer_dtype_from_scalar(p) - self.assertEqual(dtype, np.object_) - self.assertEqual(val, p) - - for data in [datetime.date(2000, 1, 1), 'foo']: - dtype, val = _infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) - - def 
test_cast_scalar_to_array(self): - arr = _cast_scalar_to_array((3, 2), 1, dtype=np.int64) - exp = np.ones((3, 2), dtype=np.int64) - tm.assert_numpy_array_equal(arr, exp) - - arr = _cast_scalar_to_array((3, 2), 1.1) - exp = np.empty((3, 2), dtype=np.float64) - exp.fill(1.1) - tm.assert_numpy_array_equal(arr, exp) - - arr = _cast_scalar_to_array((2, 3), Timestamp('2011-01-01')) - exp = np.empty((2, 3), dtype='datetime64[ns]') - exp.fill(np.datetime64('2011-01-01')) - tm.assert_numpy_array_equal(arr, exp) - - # pandas dtype is stored as object dtype - obj = Timestamp('2011-01-01', tz='US/Eastern') - arr = _cast_scalar_to_array((2, 3), obj) - exp = np.empty((2, 3), dtype=np.object) - exp.fill(obj) - tm.assert_numpy_array_equal(arr, exp) - - obj = Period('2011-01-01', freq='D') - arr = _cast_scalar_to_array((2, 3), obj) - exp = np.empty((2, 3), dtype=np.object) - exp.fill(obj) - tm.assert_numpy_array_equal(arr, exp) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 8ec6c6e6263d8..700cd8ea720f6 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -188,6 +188,7 @@ def test_timestamp_compare(self): df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', 'ne': 'ne'} + for left, right in ops.items(): left_f = getattr(operator, left) right_f = getattr(operator, right) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 92d06950e61d2..43933f68986af 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -108,7 +108,7 @@ def test_setitem_series_int64(self): def test_setitem_series_int8(self): # integer dtype coercion (no change) obj = pd.Series([1, 2, 3, 4], dtype=np.int8) - self.assertEqual(obj.dtype, np.int8) + assert obj.dtype == np.int8 exp = pd.Series([1, 1, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, 
np.int32(1), exp, np.int8) @@ -211,12 +211,11 @@ def test_setitem_series_datetime64(self): exp, 'datetime64[ns]') # datetime64 + int -> object - # ToDo: The result must be object exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp(1), + 1, pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-04')]) - self._assert_setitem_series_conversion(obj, 1, exp, 'datetime64[ns]') + self._assert_setitem_series_conversion(obj, 1, exp, np.object) # datetime64 + object -> object exp = pd.Series([pd.Timestamp('2011-01-01'), @@ -225,8 +224,6 @@ def test_setitem_series_datetime64(self): pd.Timestamp('2011-01-04')]) self._assert_setitem_series_conversion(obj, 'x', exp, np.object) - # ToDo: add more tests once the above issue has been fixed - def test_setitem_series_datetime64tz(self): tz = 'US/Eastern' obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), @@ -267,14 +264,12 @@ def test_setitem_series_datetime64tz(self): pd.Timestamp('2011-01-04', tz=tz)]) self._assert_setitem_series_conversion(obj, 1, exp, np.object) - # ToDo: add more tests once the above issue has been fixed - def test_setitem_series_timedelta64(self): obj = pd.Series([pd.Timedelta('1 day'), pd.Timedelta('2 day'), pd.Timedelta('3 day'), pd.Timedelta('4 day')]) - self.assertEqual(obj.dtype, 'timedelta64[ns]') + assert obj.dtype == 'timedelta64[ns]' # timedelta64 + timedelta64 -> timedelta64 exp = pd.Series([pd.Timedelta('1 day'), @@ -285,12 +280,11 @@ def test_setitem_series_timedelta64(self): exp, 'timedelta64[ns]') # timedelta64 + int -> object - # ToDo: The result must be object exp = pd.Series([pd.Timedelta('1 day'), - pd.Timedelta(1), + 1, pd.Timedelta('3 day'), pd.Timedelta('4 day')]) - self._assert_setitem_series_conversion(obj, 1, exp, 'timedelta64[ns]') + self._assert_setitem_series_conversion(obj, 1, exp, np.object) # timedelta64 + object -> object exp = pd.Series([pd.Timedelta('1 day'), @@ -299,8 +293,7 @@ def test_setitem_series_timedelta64(self): pd.Timedelta('4 day')]) 
self._assert_setitem_series_conversion(obj, 'x', exp, np.object) - # ToDo: add more tests once the above issue has been fixed - + @pytest.mark.xfail(reason="add some tests for me") def test_setitem_series_period(self): pass @@ -839,12 +832,15 @@ def test_where_series_datetime64(self): pd.Timestamp('2012-01-04')]) self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') - # ToDo: coerce to object - msg = "cannot coerce a Timestamp with a tz on a naive Block" - with tm.assert_raises_regex(TypeError, msg): - obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + # datetime64 + datetime64tz -> object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-01', tz='US/Eastern')]) + values = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_where_conversion(obj, cond, values, exp, np.object) - # ToDo: do not coerce to UTC, must be object + # TODO: do not coerce to UTC, must be object values = pd.Series([pd.Timestamp('2012-01-01', tz='US/Eastern'), pd.Timestamp('2012-01-02', tz='US/Eastern'), pd.Timestamp('2012-01-03', tz='US/Eastern'), @@ -1045,6 +1041,7 @@ def test_fillna_series_datetime64(self): pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-04')]) value = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_fillna_conversion(obj, value, exp, np.object) # datetime64 + int => object @@ -1077,6 +1074,7 @@ def test_fillna_series_datetime64tz(self): pd.Timestamp('2011-01-03', tz=tz), pd.Timestamp('2011-01-04', tz=tz)]) value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_fillna_conversion(obj, value, exp, 'datetime64[ns, US/Eastern]') diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index da8a896cb6f4a..af162363084ff 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -1,5 +1,3 @@ -import pytest - import numpy as np import pandas as pd from pandas 
import date_range, Index, DataFrame, Series, Timestamp @@ -56,10 +54,10 @@ def test_indexing_with_datetime_tz(self): 'US/Pacific') # trying to set a single element on a part of a different timezone - def f(): - df.loc[df.new_col == 'new', 'time'] = v - - pytest.raises(ValueError, f) + df2 = df.copy() + assert df2.time.dtype == 'datetime64[ns, UTC]' + df2.loc[df2.new_col == 'new', 'time'] = v + assert df2.time.dtype == 'object' v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') df.loc[df.new_col == 'new', 'time'] = v diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index c3fb35b56a9a7..3f9b4146f1616 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -358,10 +358,11 @@ def test_multi_assign(self): # GH 14001 expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], 'PF': [0, 0, 0, 0, 1, 1], - 'col1': [0., 1., 4., 6., 8., 10.], + 'col1': [0, 1, 4, 6, 8, 10], 'col2': [12, 7, 16, np.nan, 20, 22]}) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values + tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 6ded4d593a571..598a6cb8a5ef0 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1360,14 +1360,16 @@ def test_where_dups(self): expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) assert_series_equal(comb, expected) - def test_where_datetime(self): + def test_where_datetime_coerce(self): + s = Series(date_range('20130102', periods=2)) - expected = Series([10, 10], dtype='datetime64[ns]') + expected = Series([10, 10], dtype='object') mask = np.array([False, False]) rs = s.where(mask, [10, 10]) assert_series_equal(rs, expected) + # convert to object as we are passing non-datetime64 rs = s.where(mask, 10) assert_series_equal(rs, 
expected) @@ -1378,7 +1380,7 @@ def test_where_datetime(self): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='datetime64[ns]') + expected = Series([10, None], dtype=object) assert_series_equal(rs, expected) # GH 15701 @@ -1389,9 +1391,9 @@ def test_where_datetime(self): expected = Series([pd.NaT, s[1]]) assert_series_equal(rs, expected) - def test_where_timedelta(self): + def test_where_timedelta_coerce(self): s = Series([1, 2], dtype='timedelta64[ns]') - expected = Series([10, 10], dtype='timedelta64[ns]') + expected = Series([10, 10], dtype=object) mask = np.array([False, False]) rs = s.where(mask, [10, 10]) @@ -1407,9 +1409,38 @@ def test_where_timedelta(self): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='timedelta64[ns]') + expected = Series([10, None], dtype=object) assert_series_equal(rs, expected) + def test_where_consistency(self): + + # 16402 + # where should be consistent across various functions + s = Series([Timestamp('20130101'), pd.NaT]) + + # this is currently wrong :<, should be object + result = s.fillna(Timestamp('20130101', tz='US/Eastern')) + expected = Series([Timestamp('2012-12-31 19:00:00'), + Timestamp('2013-01-01 00:00:00')] + ).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = s.fillna('foo') + expected = Series([Timestamp('20130101'), 'foo']) + assert_series_equal(result, expected) + + s2 = s.copy() + s2[1] = 'bar' + expected = Series([Timestamp('20130101'), 'bar']) + assert_series_equal(s2, expected) + + # see 16406 for construction bug + result = s.where([True, False], Timestamp('20130101', tz='US/Eastern')) + expected = Series([Timestamp('20130101'), + Timestamp('20130101', tz='US/Eastern')], + dtype=object) + assert_series_equal(result, expected) + def test_mask(self): # compare with tested results in test_where s = Series(np.random.randn(5))