CLN/BUG: fix ndarray assignment may cause unexpected cast

supersedes pandas-dev#14145 closes pandas-dev#14001
jreback · Jul 4, 2017 · 594434d · 594434d
1 parent 15db50b
commit 594434d
Show file tree

Hide file tree

Showing 11 changed files with 309 additions and 77 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -54,6 +54,7 @@ Backwards incompatible API changes
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
 - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
 
+
 .. _whatsnew_0210.api:
 
 Other API Changes
@@ -97,6 +98,9 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
+- Bug in assignment against datetime-like data where an ``int`` value may be incorrectly converted to datetime-like (:issue:`14145`)
+- Bug in assignment of an ``np.ndarray`` with ``float64`` dtype to ``int64`` data where the result may incorrectly keep ``int64`` dtype (:issue:`14001`)
+
 
 
 Indexing

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -272,7 +272,7 @@ def maybe_promote(dtype, fill_value=np.nan):
         else:
             if issubclass(dtype.type, np.datetime64):
                 try:
-                    fill_value = lib.Timestamp(fill_value).value
+                    fill_value = Timestamp(fill_value).value
                 except:
                     # the proper thing to do here would probably be to upcast
                     # to object (but numpy 1.6.1 doesn't do this properly)
@@ -349,9 +349,9 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
 
     # a 1-element ndarray
     if isinstance(val, np.ndarray):
+        msg = "invalid ndarray passed to _infer_dtype_from_scalar"
         if val.ndim != 0:
-            raise ValueError(
-                "invalid ndarray passed to _infer_dtype_from_scalar")
+            raise ValueError(msg)
 
         dtype = val.dtype
         val = val.item()
@@ -552,7 +552,7 @@ def conv(r, dtype):
             if isnull(r):
                 pass
             elif dtype == _NS_DTYPE:
-                r = lib.Timestamp(r)
+                r = Timestamp(r)
             elif dtype == _TD_DTYPE:
                 r = _coerce_scalar_to_timedelta_type(r)
             elif dtype == np.bool_:
@@ -1026,3 +1026,19 @@ def find_common_type(types):
             return np.object
 
     return np.find_common_type(types, [])
+
+
+def _cast_scalar_to_array(shape, value, dtype=None):
+    """
+    Create an np.ndarray of the given shape filled with a scalar value.
+
+    Parameters
+    ----------
+    shape : int or sequence of int
+        Shape of the array to create; passed straight to ``np.empty``.
+    value : scalar
+        Value used to fill every element of the array.
+    dtype : np.dtype, optional
+        dtype of the resulting array. When None, both the dtype and the
+        value to fill with come from ``infer_dtype_from_scalar(value)``.
+
+    Returns
+    -------
+    np.ndarray
+        Array of ``shape`` with every element set to the fill value.
+    """
+
+    if dtype is None:
+        # no dtype requested: infer the dtype and the value to fill with
+        dtype, fill_value = infer_dtype_from_scalar(value)
+    else:
+        fill_value = value
+
+    values = np.empty(shape, dtype=dtype)
+    values.fill(fill_value)
+
+    return values
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -59,6 +59,7 @@
     is_named_tuple)
 from pandas.core.dtypes.missing import isnull, notnull
 
+
 from pandas.core.common import (_try_sort,
                                 _default_index,
                                 _values_from_object,
@@ -385,15 +386,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 raise_with_traceback(exc)
 
             if arr.ndim == 0 and index is not None and columns is not None:
-                if isinstance(data, compat.string_types) and dtype is None:
-                    dtype = np.object_
-                if dtype is None:
-                    dtype, data = infer_dtype_from_scalar(data)
-
-                values = np.empty((len(index), len(columns)), dtype=dtype)
-                values.fill(data)
-                mgr = self._init_ndarray(values, index, columns, dtype=dtype,
-                                         copy=False)
+                values = _cast_scalar_to_array((len(index), len(columns)),
+                                               data, dtype=dtype)
+                mgr = self._init_ndarray(values, index, columns,
+                                         dtype=values.dtype, copy=False)
             else:
                 raise ValueError('DataFrame constructor not properly called!')
 
@@ -507,7 +503,7 @@ def _get_axes(N, K, index=index, columns=columns):
         values = _prep_ndarray(values, copy=copy)
 
         if dtype is not None:
-            if values.dtype != dtype:
+            if not is_dtype_equal(values.dtype, dtype):
                 try:
                     values = values.astype(dtype)
                 except Exception as orig:
@@ -2683,9 +2679,8 @@ def reindexer(value):
 
         else:
             # upcast the scalar
-            dtype, value = infer_dtype_from_scalar(value)
-            value = np.repeat(value, len(self.index)).astype(dtype)
-            value = maybe_cast_to_datetime(value, dtype)
+            value = _cast_scalar_to_array(len(self.index), value)
+            value = _possibly_cast_to_datetime(value, value.dtype)
 
         # return internal types directly
         if is_extension_type(value):

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -388,7 +388,8 @@ def fillna(self, value, limit=None, inplace=False, downcast=None,
 
         # fillna, but if we cannot coerce, then try again as an ObjectBlock
         try:
-            values, _, value, _ = self._try_coerce_args(self.values, value)
+            values, _, _, _ = self._try_coerce_args(self.values, value)
+            # the coerced value may have been converted to an internal
+            # representation, so discard it and pass the original to putmask
             blocks = self.putmask(mask, value, inplace=inplace)
             blocks = [b.make_block(values=self._try_coerce_result(b.values))
                       for b in blocks]
@@ -682,8 +683,43 @@ def setitem(self, indexer, value, mgr=None):
             if self.is_numeric:
                 value = np.nan
 
-        # coerce args
-        values, _, value, _ = self._try_coerce_args(self.values, value)
+        # coerce if block dtype can store value
+        values = self.values
+        try:
+            values, _, value, _ = self._try_coerce_args(values, value)
+            # can keep its own dtype
+            if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,
+                                                          value.dtype):
+                dtype = self.dtype
+            else:
+                dtype = 'infer'
+
+        except (TypeError, ValueError):
+            # current dtype cannot store value, coerce to common dtype
+            find_dtype = False
+
+            if hasattr(value, 'dtype'):
+                dtype = value.dtype
+                find_dtype = True
+
+            elif is_scalar(value):
+                if isnull(value):
+                    # NaN promotion is handled in a later path
+                    dtype = False
+                else:
+                    dtype, _ = _infer_dtype_from_scalar(value,
+                                                        pandas_dtype=True)
+                    find_dtype = True
+            else:
+                dtype = 'infer'
+
+            if find_dtype:
+                dtype = _find_common_type([values.dtype, dtype])
+                if not is_dtype_equal(self.dtype, dtype):
+                    b = self.astype(dtype)
+                    return b.setitem(indexer, value, mgr=mgr)
+
+        # value must be storable at this point
         arr_value = np.array(value)
 
         # cast the values to a type that can hold nan (if necessary)
@@ -713,19 +749,8 @@ def setitem(self, indexer, value, mgr=None):
                     raise ValueError("cannot set using a slice indexer with a "
                                      "different length than the value")
 
-        try:
-
-            def _is_scalar_indexer(indexer):
-                # return True if we are all scalar indexers
-
-                if arr_value.ndim == 1:
-                    if not isinstance(indexer, tuple):
-                        indexer = tuple([indexer])
-                    return all([is_scalar(idx) for idx in indexer])
-                return False
-
-            def _is_empty_indexer(indexer):
-                # return a boolean if we have an empty indexer
+        def _is_scalar_indexer(indexer):
+            # return True if we are all scalar indexers
 
                 if arr_value.ndim == 1:
                     if not isinstance(indexer, tuple):
@@ -777,23 +802,43 @@ def _is_empty_indexer(indexer):
             raise
         except TypeError:
 
-            # cast to the passed dtype if possible
-            # otherwise raise the original error
-            try:
-                # e.g. we are uint32 and our value is uint64
-                # this is for compat with older numpies
-                block = self.make_block(transf(values.astype(value.dtype)))
-                return block.setitem(indexer=indexer, value=value, mgr=mgr)
+        def _is_empty_indexer(indexer):
+            # return a boolean if we have an empty indexer
 
-            except:
-                pass
-
-            raise
+            if arr_value.ndim == 1:
+                if not isinstance(indexer, tuple):
+                    indexer = tuple([indexer])
+                return any(isinstance(idx, np.ndarray) and len(idx) == 0
+                           for idx in indexer)
+            return False
 
-        except Exception:
+        # empty indexers
+        # 8669 (empty)
+        if _is_empty_indexer(indexer):
             pass
 
-        return [self]
+        # setting a single element for each dim and with a rhs that could
+        # be say a list
+        # GH 6043
+        elif _is_scalar_indexer(indexer):
+            values[indexer] = value
+
+        # if we are an exact match (ex-broadcasting),
+        # then use the resultant dtype
+        elif (len(arr_value.shape) and
+              arr_value.shape[0] == values.shape[0] and
+              np.prod(arr_value.shape) == np.prod(values.shape)):
+            values[indexer] = value
+            values = values.astype(arr_value.dtype)
+
+        # set
+        else:
+            values[indexer] = value
+
+        # coerce and try to infer the dtypes of the result
+        values = self._try_coerce_and_cast_result(values, dtype)
+        block = self.make_block(transf(values), fastpath=True)
+        return block
 
     def putmask(self, mask, new, align=True, inplace=False, axis=0,
                 transpose=False, mgr=None):
@@ -1264,6 +1309,7 @@ def func(cond, values, other):
 
             values, values_mask, other, other_mask = self._try_coerce_args(
                 values, other)
+
             try:
                 return self._try_coerce_result(expressions.where(
                     cond, values, other, raise_on_error=True))
@@ -1543,6 +1589,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
             new = new[mask]
 
         mask = _safe_reshape(mask, new_values.shape)
+
         new_values[mask] = new
         new_values = self._try_coerce_result(new_values)
         return [self.make_block(values=new_values)]
@@ -1712,7 +1759,7 @@ def fillna(self, value, **kwargs):
 
         # allow filling with integers to be
         # interpreted as seconds
-        if not isinstance(value, np.timedelta64) and is_integer(value):
+        if not isinstance(value, np.timedelta64):
             value = Timedelta(value, unit='s')
         return super(TimeDeltaBlock, self).fillna(value, **kwargs)
 
@@ -1949,6 +1996,15 @@ def _maybe_downcast(self, blocks, downcast=None):
     def _can_hold_element(self, element):
         return True
 
+    def _try_coerce_args(self, values, other):
+        """
+        Coerce the arguments of an operation for this block.
+
+        A DatetimeIndex ``other`` is replaced by the values of its
+        ``asobject`` form; any other input is passed through unchanged.
+        Returns the tuple ``(values, False, other, False)``.
+        """
+        if not isinstance(other, ABCDatetimeIndex):
+            return values, False, other, False
+
+        # to store DatetimeTZBlock as object
+        as_objects = other.asobject.values
+        return values, False, as_objects, False
+
     def _try_cast(self, element):
         return element
 
@@ -2288,8 +2344,6 @@ def _try_coerce_args(self, values, other):
                                 "naive Block")
             other_mask = isnull(other)
             other = other.asm8.view('i8')
-        elif hasattr(other, 'dtype') and is_integer_dtype(other):
-            other = other.view('i8')
         else:
             try:
                 other = np.asarray(other)
@@ -2466,6 +2520,8 @@ def _try_coerce_args(self, values, other):
                 raise ValueError("incompatible or non tz-aware value")
             other_mask = isnull(other)
             other = other.value
+        else:
+            raise TypeError
 
         return values, values_mask, other, other_mask
 

diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -178,11 +178,9 @@ def _init_data(self, data, copy, dtype, **kwargs):
             copy = False
             dtype = None
         elif is_scalar(data) and all(x is not None for x in passed_axes):
-            if dtype is None:
-                dtype, data = infer_dtype_from_scalar(data)
-            values = np.empty([len(x) for x in passed_axes], dtype=dtype)
-            values.fill(data)
-            mgr = self._init_matrix(values, passed_axes, dtype=dtype,
+            values = _cast_scalar_to_array([len(x) for x in passed_axes],
+                                           data, dtype=dtype)
+            mgr = self._init_matrix(values, passed_axes, dtype=values.dtype,
                                     copy=False)
             copy = False
         else:  # pragma: no cover
@@ -582,9 +580,7 @@ def __setitem__(self, key, value):
                                      shape[1:], tuple(map(int, value.shape))))
             mat = np.asarray(value)
         elif is_scalar(value):
-            dtype, value = infer_dtype_from_scalar(value)
-            mat = np.empty(shape[1:], dtype=dtype)
-            mat.fill(value)
+            mat = _cast_scalar_to_array(shape[1:], value)
         else:
             raise TypeError('Cannot set item of type: %s' % str(type(value)))