API: This fixes a number of inconsistencies and API issues

w.r.t. dtype conversions. This is a reprise of pandas-dev#14145 & pandas-dev#16408. This removes some code from the core structures & pushes it to internals, where the primitives are made more consistent. This should allow us to be a bit more consistent for pandas2 type things. closes pandas-dev#16402 supersedes pandas-dev#14145 closes pandas-dev#14001
jreback · Jul 4, 2017 · b0727cd · b0727cd
1 parent 80e7869
commit b0727cd
Show file tree

Hide file tree

Showing 16 changed files with 428 additions and 386 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -54,6 +54,18 @@ Backwards incompatible API changes
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
 - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
 
+.. _whatsnew_0210.dtype_conversions:
+
+Dtype Conversions
+^^^^^^^^^^^^^^^^^
+
+Example about setitem / where with bools.
+
+
+
+- Inconsistent behavior in ``.where()`` with datetimelikes, which would raise rather than coerce to ``object`` (:issue:`16402`)
+- Bug in assignment against datetime-like data with ``int``, which may incorrectly convert the value to datetime-like (:issue:`14145`)
+- Bug in assignment against ``int64`` data with a ``np.ndarray`` of ``float64`` dtype, which may keep the ``int64`` dtype (:issue:`14001`)
 
 .. _whatsnew_0210.api:
 

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -19,6 +19,7 @@ cimport tslib
 from hashtable cimport *
 from pandas._libs import tslib, algos, hashtable as _hash
 from pandas._libs.tslib import Timestamp, Timedelta
+from datetime import datetime, timedelta
 
 from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
                        pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
         return 'm8[ns]'
 
 cpdef convert_scalar(ndarray arr, object value):
+    # we don't turn integers
+    # into datetimes/timedeltas
+
+    # we don't turn bools into int/float/complex
+
     if arr.descr.type_num == NPY_DATETIME:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timestamp):
-            return value.value
+        elif isinstance(value, datetime):
+            return Timestamp(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timestamp(value).value
+        raise ValueError("cannot set a Timestamp with a non-timestamp")
+
     elif arr.descr.type_num == NPY_TIMEDELTA:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timedelta):
-            return value.value
+        elif isinstance(value, timedelta):
+            return Timedelta(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timedelta(value).value
+        raise ValueError("cannot set a Timedelta with a non-timedelta")
+
+    if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not
+            issubclass(arr.dtype.type, np.bool_)):
+        if util.is_bool_object(value):
+            raise ValueError('Cannot assign bool to float/integer series')
 
     if issubclass(arr.dtype.type, (np.integer, np.bool_)):
         if util.is_float_object(value) and value != value:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -149,6 +149,12 @@ def _reconstruct_data(values, dtype, original):
         pass
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
+    elif is_bool_dtype(dtype):
+        values = values.astype(dtype)
+
+        # we only support object dtypes bool Index
+        if isinstance(original, Index):
+            values = values.astype(object)
     elif dtype is not None:
         values = values.astype(dtype)
 

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -272,7 +272,7 @@ def maybe_promote(dtype, fill_value=np.nan):
         else:
             if issubclass(dtype.type, np.datetime64):
                 try:
-                    fill_value = Timestamp(fill_value).value
+                    fill_value = tslib.Timestamp(fill_value).value
                 except:
                     # the proper thing to do here would probably be to upcast
                     # to object (but numpy 1.6.1 doesn't do this properly)
@@ -333,6 +333,23 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
+def infer_dtype_from(val, pandas_dtype=False):
+    """
+    interpret the dtype from a scalar or array. This is a convenience
+    routine to infer the dtype from a scalar or an array
+
+    Parameters
+    ----------
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, scalar/array belongs to pandas extension types is inferred as
+        object
+    """
+    if is_scalar(val):
+        return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
+    return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
+
+
 def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar
@@ -408,23 +425,29 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
     return dtype, val
 
 
-def infer_dtype_from_array(arr):
+def infer_dtype_from_array(arr, pandas_dtype=False):
     """
     infer the dtype from a scalar or array
 
     Parameters
     ----------
     arr : scalar or array
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, array belongs to pandas extension types
+        is inferred as object
 
     Returns
     -------
-    tuple (numpy-compat dtype, array)
+    tuple (numpy-compat/pandas-compat dtype, array)
 
     Notes
     -----
-    These infer to numpy dtypes exactly
-    with the exception that mixed / object dtypes
-    are not coerced by stringifying or conversion
+    if pandas_dtype=False, these infer to numpy dtypes
+    exactly, with the exception that mixed / object dtypes
+    are not coerced by stringifying or conversion
+
+    if pandas_dtype=True, datetime64tz-aware/categorical
+    types will retain their character.
 
     Examples
     --------
@@ -442,6 +465,13 @@ def infer_dtype_from_array(arr):
     if not is_list_like(arr):
         arr = [arr]
 
+    if pandas_dtype and (is_categorical_dtype(arr) or
+                         is_datetime64tz_dtype(arr)):
+        return arr.dtype, arr
+
+    elif isinstance(arr, ABCSeries):
+        return arr.dtype, np.asarray(arr)
+
     # don't force numpy coerce with nan's
     inferred = lib.infer_dtype(arr)
     if inferred in ['string', 'bytes', 'unicode',
@@ -552,7 +582,7 @@ def conv(r, dtype):
             if isnull(r):
                 pass
             elif dtype == _NS_DTYPE:
-                r = Timestamp(r)
+                r = tslib.Timestamp(r)
             elif dtype == _TD_DTYPE:
                 r = _coerce_scalar_to_timedelta_type(r)
             elif dtype == np.bool_:
@@ -1028,13 +1058,25 @@ def find_common_type(types):
     return np.find_common_type(types, [])
 
 
-def _cast_scalar_to_array(shape, value, dtype=None):
+def cast_scalar_to_array(shape, value, dtype=None):
     """
     create np.ndarray of specified shape and dtype, filled with values
+
+    Parameters
+    ----------
+    shape : tuple
+    value : scalar value
+    dtype : np.dtype, optional
+        dtype to coerce
+
+    Returns
+    -------
+    ndarray of shape, filled with value, of specified / inferred dtype
+
     """
 
     if dtype is None:
-        dtype, fill_value = _infer_dtype_from_scalar(value)
+        dtype, fill_value = infer_dtype_from_scalar(value)
     else:
         fill_value = value
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -11,7 +11,8 @@
                      ExtensionDtype)
 from .generic import (ABCCategorical, ABCPeriodIndex,
                       ABCDatetimeIndex, ABCSeries,
-                      ABCSparseArray, ABCSparseSeries)
+                      ABCSparseArray, ABCSparseSeries,
+                      ABCIndexClass)
 from .inference import is_string_like
 from .inference import *  # noqa
 
@@ -1540,6 +1541,16 @@ def is_bool_dtype(arr_or_dtype):
     except ValueError:
         # this isn't even a dtype
         return False
+
+    if isinstance(arr_or_dtype, ABCIndexClass):
+
+        # TODO(jreback)
+        # we don't have a boolean Index class,
+        # so it's object dtype; we need to infer
+        # the type to guess this
+        return (arr_or_dtype.is_object and
+                arr_or_dtype.inferred_type == 'boolean')
+
     return issubclass(tipo, np.bool_)
 
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -25,7 +25,8 @@
 import numpy.ma as ma
 
 from pandas.core.dtypes.cast import (
-    maybe_upcast, infer_dtype_from_scalar,
+    maybe_upcast,
+    cast_scalar_to_array,
     maybe_cast_to_datetime,
     maybe_infer_to_datetimelike,
     maybe_convert_platform,
@@ -386,8 +387,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 raise_with_traceback(exc)
 
             if arr.ndim == 0 and index is not None and columns is not None:
-                values = _cast_scalar_to_array((len(index), len(columns)),
-                                               data, dtype=dtype)
+                values = cast_scalar_to_array((len(index), len(columns)),
+                                              data, dtype=dtype)
                 mgr = self._init_ndarray(values, index, columns,
                                          dtype=values.dtype, copy=False)
             else:
@@ -2679,8 +2680,8 @@ def reindexer(value):
 
         else:
             # upcast the scalar
-            value = _cast_scalar_to_array(len(self.index), value)
-            value = _possibly_cast_to_datetime(value, value.dtype)
+            value = cast_scalar_to_array(len(self.index), value)
+            value = maybe_cast_to_datetime(value, value.dtype)
 
         # return internal types directly
         if is_extension_type(value):

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -13,7 +13,6 @@
 from pandas.core.dtypes.common import (
     _ensure_int64,
     _ensure_object,
-    needs_i8_conversion,
     is_scalar,
     is_number,
     is_integer, is_bool,
@@ -26,7 +25,8 @@
     is_dict_like,
     is_re_compilable,
     pandas_dtype)
-from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
+from pandas.core.dtypes.cast import (
+    maybe_promote, maybe_upcast_putmask)
 from pandas.core.dtypes.missing import isnull, notnull
 from pandas.core.dtypes.generic import ABCSeries, ABCPanel
 
@@ -5336,48 +5336,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
                 raise NotImplementedError("cannot align with a higher "
                                           "dimensional NDFrame")
 
-        elif is_list_like(other):
-
-            if self.ndim == 1:
-
-                # try to set the same dtype as ourselves
-                try:
-                    new_other = np.array(other, dtype=self.dtype)
-                except ValueError:
-                    new_other = np.array(other)
-                except TypeError:
-                    new_other = other
-
-                # we can end up comparing integers and m8[ns]
-                # which is a numpy no no
-                is_i8 = needs_i8_conversion(self.dtype)
-                if is_i8:
-                    matches = False
-                else:
-                    matches = (new_other == np.array(other))
-
-                if matches is False or not matches.all():
-
-                    # coerce other to a common dtype if we can
-                    if needs_i8_conversion(self.dtype):
-                        try:
-                            other = np.array(other, dtype=self.dtype)
-                        except:
-                            other = np.array(other)
-                    else:
-                        other = np.asarray(other)
-                        other = np.asarray(other,
-                                           dtype=np.common_type(other,
-                                                                new_other))
-
-                    # we need to use the new dtype
-                    try_quick = False
-                else:
-                    other = new_other
-            else:
-
-                other = np.array(other)
-
         if isinstance(other, np.ndarray):
 
             if other.shape != self.shape:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -26,6 +26,7 @@
     is_object_dtype,
     is_categorical_dtype,
     is_interval_dtype,
+    is_bool,
     is_bool_dtype,
     is_signed_integer_dtype,
     is_unsigned_integer_dtype,
@@ -610,9 +611,18 @@ def repeat(self, repeats, *args, **kwargs):
     def where(self, cond, other=None):
         if other is None:
             other = self._na_value
-        values = np.where(cond, self.values, other)
 
         dtype = self.dtype
+        values = self.values
+
+        if is_bool(other) or is_bool_dtype(other):
+
+            # bools force casting
+            values = values.astype(object)
+            dtype = None
+
+        values = np.where(cond, values, other)
+
         if self._is_numeric_dtype and np.any(isnull(values)):
             # We can't coerce to the numeric dtype of "self" (unless
             # it's float) if there are NaN values in our output.

diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -2,9 +2,14 @@
 from pandas._libs import (index as libindex,
                           algos as libalgos, join as libjoin)
 from pandas.core.dtypes.common import (
-    is_dtype_equal, pandas_dtype,
-    is_float_dtype, is_object_dtype,
-    is_integer_dtype, is_scalar)
+    is_dtype_equal,
+    pandas_dtype,
+    is_float_dtype,
+    is_object_dtype,
+    is_integer_dtype,
+    is_bool,
+    is_bool_dtype,
+    is_scalar)
 from pandas.core.common import _asarray_tuplesafe, _values_from_object
 
 from pandas import compat
@@ -56,6 +61,16 @@ def _maybe_cast_slice_bound(self, label, side, kind):
         # we will try to coerce to integers
         return self._maybe_cast_indexer(label)
 
+    def _convert_for_op(self, value):
+        """ Convert value to be insertable to ndarray """
+
+        if is_bool(value) or is_bool_dtype(value):
+            # force conversion to object
+            # so we don't lose the bools
+            raise TypeError
+
+        return value
+
     def _convert_tolerance(self, tolerance):
         try:
             return float(tolerance)