rebase & cleanup, fixup some edge cases
jreback committed May 31, 2017
1 parent fdd19a5 commit 0290308
Showing 20 changed files with 744 additions and 610 deletions.
16 changes: 13 additions & 3 deletions doc/source/whatsnew/v0.21.0.txt
@@ -50,6 +50,19 @@ Backwards incompatible API changes
- Accessing a non-existent attribute on a closed :class:`HDFStore` will now
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)

.. _whatsnew_0210.dtype_conversions:

Dtype Conversions
^^^^^^^^^^^^^^^^^

An example of the new ``setitem`` / ``where`` behavior with bools is to be added here.



- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)


.. _whatsnew_0210.api:

@@ -88,9 +101,6 @@ Bug Fixes
Conversion
^^^^^^^^^^

- Bug in assignment against datetime-like data with ``int`` may incorrectly converted to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)



Indexing
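The dtype-conversion notes above are terse, so here is a minimal sketch of the behaviors they describe, using ordinary pandas calls (the data are illustrative, and the exact exception raised for the datetime case is an assumption):

    import numpy as np
    import pandas as pd

    dti = pd.date_range('2017-01-01', periods=3)

    # GH 14145: assigning a plain int into datetime64 data should no longer
    # be silently reinterpreted as a datetime.
    s = pd.Series(dti)
    try:
        s[1] = 1
    except (TypeError, ValueError):
        pass  # rejected (or upcast) rather than coerced to a Timestamp

    # GH 14001: assigning a float64 ndarray into int64 data should no longer
    # incorrectly keep the int64 dtype.
    t = pd.Series([1, 2, 3], dtype='int64')
    t[:2] = np.array([0.5, 1.5], dtype='float64')

    # GH 16402: .where() on datetimelikes with an incompatible `other` now
    # coerces to object rather than raising.
    res = pd.Series(dti).where([True, False, True], other='foo')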
26 changes: 20 additions & 6 deletions pandas/_libs/index.pyx
@@ -19,6 +19,7 @@ cimport tslib
from hashtable cimport *
from pandas._libs import tslib, algos, hashtable as _hash
from pandas._libs.tslib import Timestamp, Timedelta
from datetime import datetime, timedelta

from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
return 'm8[ns]'

cpdef convert_scalar(ndarray arr, object value):
# we don't turn integers
# into datetimes/timedeltas

# we don't turn bools into int/float/complex

if arr.descr.type_num == NPY_DATETIME:
if isinstance(value, np.ndarray):
pass
elif isinstance(value, Timestamp):
return value.value
elif isinstance(value, datetime):
return Timestamp(value).value
elif value is None or value != value:
return iNaT
else:
elif util.is_string_object(value):
return Timestamp(value).value
raise ValueError("cannot set a Timestamp with a non-timestamp")

elif arr.descr.type_num == NPY_TIMEDELTA:
if isinstance(value, np.ndarray):
pass
elif isinstance(value, Timedelta):
return value.value
elif isinstance(value, timedelta):
return Timedelta(value).value
elif value is None or value != value:
return iNaT
else:
elif util.is_string_object(value):
return Timedelta(value).value
raise ValueError("cannot set a Timedelta with a non-timedelta")

if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not
issubclass(arr.dtype.type, np.bool_)):
if util.is_bool_object(value):
raise ValueError('Cannot assign bool to float/integer series')

if issubclass(arr.dtype.type, (np.integer, np.bool_)):
if util.is_float_object(value) and value != value:
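A rough sketch of what the tightened ``convert_scalar`` means for ``setitem`` on typed data; the error messages quoted in the comments come from the diff above, while the assumption that these guards are reached through ordinary Series indexing is mine:

    import pandas as pd

    dt = pd.Series(pd.date_range('2017-01-01', periods=3))

    # Timestamps, datetimes, and parseable strings are still converted on
    # setitem; anything else now falls through to
    # ValueError("cannot set a Timestamp with a non-timestamp").
    dt[0] = '2017-02-01'

    # The new trailing check rejects bools assigned into integer/float data
    # ("Cannot assign bool to float/integer series").
    ints = pd.Series([1, 2, 3], dtype='int64')
    try:
        ints[0] = True
    except ValueError:
        pass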
55 changes: 48 additions & 7 deletions pandas/core/dtypes/cast.py
@@ -6,7 +6,7 @@
import warnings

from pandas._libs import tslib, lib
from pandas._libs.tslib import iNaT
from pandas._libs.tslib import iNaT, Timestamp
from pandas.compat import string_types, text_type, PY3
from .common import (_ensure_object, is_bool, is_integer, is_float,
is_complex, is_datetimetz, is_categorical_dtype,
@@ -333,6 +333,23 @@ def maybe_promote(dtype, fill_value=np.nan):
return dtype, fill_value


def infer_dtype_from(val, pandas_dtype=False):
"""
    interpret the dtype from a scalar or array. This is a convenience
    routine to infer the dtype from a scalar or an array.
Parameters
----------
pandas_dtype : bool, default False
whether to infer dtype including pandas extension types.
        If False, a scalar/array belonging to a pandas extension type
        is inferred as object
"""
if is_scalar(val):
return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)


def infer_dtype_from_scalar(val, pandas_dtype=False):
"""
interpret the dtype from a scalar
@@ -408,24 +425,32 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
return dtype, val


def infer_dtype_from_array(arr):
def infer_dtype_from_array(arr, pandas_dtype=False):
"""
infer the dtype from a scalar or array
Parameters
----------
arr : scalar or array
pandas_dtype : bool, default False
whether to infer dtype including pandas extension types.
        If False, an array belonging to a pandas extension type
        is inferred as object
Returns
-------
tuple (numpy-compat dtype, array)
tuple (numpy-compat/pandas-compat dtype, array)
Notes
-----
These infer to numpy dtypes exactly
with the exception that mixed / object dtypes
    If pandas_dtype=False, these infer to numpy dtypes
    exactly, with the exception that mixed / object dtypes
    are not coerced by stringifying or conversion.
    If pandas_dtype=True, datetime64tz-aware / categorical
    types will retain their character.
Examples
--------
>>> np.asarray([1, '1'])
@@ -442,6 +467,10 @@ def infer_dtype_from_array(arr):
if not is_list_like(arr):
arr = [arr]

if pandas_dtype and (is_categorical_dtype(arr) or
is_datetime64tz_dtype(arr)):
return arr.dtype, arr

# don't force numpy coerce with nan's
inferred = lib.infer_dtype(arr)
if inferred in ['string', 'bytes', 'unicode',
@@ -1028,13 +1057,25 @@ def find_common_type(types):
return np.find_common_type(types, [])


def _cast_scalar_to_array(shape, value, dtype=None):
def cast_scalar_to_array(shape, value, dtype=None):
"""
    create an np.ndarray of the specified shape and dtype, filled with the value
Parameters
----------
shape : tuple
value : scalar value
dtype : np.dtype, optional
dtype to coerce
Returns
-------
ndarray of shape, filled with value, of specified / inferred dtype
"""

if dtype is None:
dtype, fill_value = _infer_dtype_from_scalar(value)
dtype, fill_value = infer_dtype_from_scalar(value)
else:
fill_value = value

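These are internal helpers, but a short sketch of how the new/renamed routines compose; the import path and return shapes are taken from the diff above, and the inputs are purely illustrative:

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import (
        cast_scalar_to_array, infer_dtype_from, infer_dtype_from_array)

    # infer_dtype_from dispatches to the scalar or the array variant.
    dtype, val = infer_dtype_from(1)                     # numpy integer dtype
    dtype, arr = infer_dtype_from(np.array([1.0, 2.0]))  # float64

    # With pandas_dtype=True, extension types are reported as themselves
    # rather than as object.
    cat = pd.Categorical(['a', 'b'])
    dtype, arr = infer_dtype_from_array(cat, pandas_dtype=True)

    # cast_scalar_to_array broadcasts a scalar into an ndarray of the given
    # shape, inferring the dtype when none is supplied.
    values = cast_scalar_to_array((2, 3), 5)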
14 changes: 13 additions & 1 deletion pandas/core/dtypes/common.py
@@ -11,7 +11,8 @@
ExtensionDtype)
from .generic import (ABCCategorical, ABCPeriodIndex,
ABCDatetimeIndex, ABCSeries,
ABCSparseArray, ABCSparseSeries)
ABCSparseArray, ABCSparseSeries,
ABCIndexClass)
from .inference import is_string_like
from .inference import * # noqa

@@ -1535,11 +1536,22 @@ def is_bool_dtype(arr_or_dtype):

if arr_or_dtype is None:
return False

try:
tipo = _get_dtype_type(arr_or_dtype)
except ValueError:
# this isn't even a dtype
return False

if isinstance(arr_or_dtype, ABCIndexClass):

# TODO(jreback)
# we don't have a boolean Index class
        # so it's object; we need to infer
        # to guess this
return (arr_or_dtype.is_object and
arr_or_dtype.inferred_type == 'boolean')

return issubclass(tipo, np.bool_)


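The new branch matters because pandas has no boolean Index class, so boolean data lands in an object-dtype Index (as of this commit); a sketch of the intended effect, assuming the public ``pandas.api.types`` re-export:

    import pandas as pd
    from pandas.api.types import is_bool_dtype

    # Boolean data held in an Index is stored as object dtype here, so the
    # plain issubclass(..., np.bool_) check used to return False for it.
    bool_idx = pd.Index([True, False, True])
    is_bool_dtype(bool_idx)              # inferred as boolean -> True

    # Object Indexes that do not infer as boolean still return False.
    is_bool_dtype(pd.Index(['a', 'b']))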
11 changes: 6 additions & 5 deletions pandas/core/frame.py
@@ -25,14 +25,15 @@
import numpy.ma as ma

from pandas.core.dtypes.cast import (
maybe_upcast, infer_dtype_from_scalar,
maybe_upcast,
maybe_cast_to_datetime,
maybe_infer_to_datetimelike,
maybe_convert_platform,
maybe_downcast_to_dtype,
invalidate_string_dtypes,
coerce_to_dtypes,
maybe_upcast_putmask,
cast_scalar_to_array,
find_common_type)
from pandas.core.dtypes.common import (
is_categorical_dtype,
@@ -356,8 +357,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
raise_with_traceback(exc)

if arr.ndim == 0 and index is not None and columns is not None:
values = _cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
values = cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
mgr = self._init_ndarray(values, index, columns,
dtype=values.dtype, copy=False)
else:
@@ -2649,8 +2650,8 @@ def reindexer(value):

else:
# upcast the scalar
value = _cast_scalar_to_array(len(self.index), value)
value = _possibly_cast_to_datetime(value, value.dtype)
value = cast_scalar_to_array(len(self.index), value)
value = maybe_cast_to_datetime(value, value.dtype)

# return internal types directly
if is_extension_type(value):
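The renames here are mechanical, but the code paths they touch are the scalar-broadcasting ones in DataFrame; roughly:

    import pandas as pd

    # Scalar data plus explicit index/columns is broadcast via
    # cast_scalar_to_array to the full (len(index), len(columns)) shape.
    df = pd.DataFrame(0.5, index=range(3), columns=['a', 'b'])

    # Inserting a scalar column takes the same route (shape of len(self.index)),
    # followed by maybe_cast_to_datetime for datetime-like scalars.
    df['c'] = pd.Timestamp('2017-05-31')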
45 changes: 1 addition & 44 deletions pandas/core/generic.py
@@ -13,7 +13,6 @@
from pandas.core.dtypes.common import (
_ensure_int64,
_ensure_object,
needs_i8_conversion,
is_scalar,
is_number,
is_integer, is_bool,
@@ -5301,48 +5300,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
raise NotImplemented("cannot align with a higher dimensional "
"NDFrame")

elif is_list_like(other):

if self.ndim == 1:

# try to set the same dtype as ourselves
try:
new_other = np.array(other, dtype=self.dtype)
except ValueError:
new_other = np.array(other)
except TypeError:
new_other = other

# we can end up comparing integers and m8[ns]
# which is a numpy no no
is_i8 = needs_i8_conversion(self.dtype)
if is_i8:
matches = False
else:
matches = (new_other == np.array(other))

if matches is False or not matches.all():

# coerce other to a common dtype if we can
if needs_i8_conversion(self.dtype):
try:
other = np.array(other, dtype=self.dtype)
except:
other = np.array(other)
else:
other = np.asarray(other)
other = np.asarray(other,
dtype=np.common_type(other,
new_other))

# we need to use the new dtype
try_quick = False
else:
other = new_other
else:

other = np.array(other)

if isinstance(other, np.ndarray):

if other.shape != self.shape:
@@ -5407,7 +5364,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
# reconstruct the block manager

self._check_inplace_setting(other)
new_data = self._data.putmask(mask=cond, new=other, align=align,
new_data = self._data.putmask(mask=cond, other=other, align=align,
inplace=True, axis=block_axis,
transpose=self._AXIS_REVERSED)
self._update_inplace(new_data)
15 changes: 14 additions & 1 deletion pandas/core/indexes/base.py
@@ -22,6 +22,7 @@
_ensure_platform_int,
is_integer,
is_float,
is_bool,
is_dtype_equal,
is_object_dtype,
is_categorical_dtype,
@@ -608,11 +609,21 @@ def repeat(self, repeats, *args, **kwargs):

@Appender(_index_shared_docs['where'])
def where(self, cond, other=None):

if other is None:
other = self._na_value
values = np.where(cond, self.values, other)

dtype = self.dtype
values = self.values

if is_bool(other) or is_bool_dtype(other):

# bools force casting
values = values.astype(object)
dtype = None

values = np.where(cond, values, other)

if self._is_numeric_dtype and np.any(isnull(values)):
# We can't coerce to the numeric dtype of "self" (unless
# it's float) if there are NaN values in our output.
@@ -1040,6 +1051,7 @@ def _convert_can_do_setop(self, other):

def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """

return value

def _assert_can_do_op(self, value):
@@ -3615,6 +3627,7 @@ def fillna(self, value=None, downcast=None):
# no need to care metadata other than name
# because it can't have freq if
return Index(result, name=self.name)

return self._shallow_copy()

_index_shared_docs['dropna'] = """
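Finally, a sketch of the Index.where change just above: a boolean ``other`` forces a cast to object before ``np.where``, since there is no boolean Index dtype to hold the mixed result (the object-dtype outcome is what the new branch implies, not a claim about later versions):

    import pandas as pd

    idx = pd.Index([1, 2, 3])

    # A bool `other` (or bool-dtype array) triggers values.astype(object)
    # before np.where, so the result comes back as an object Index.
    obj_result = idx.where(idx > 1, other=False)

    # A compatible numeric `other` keeps the numeric path.
    num_result = idx.where(idx > 1, other=0)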