
Merge pull request #10569 from jreback/comp
ERR: Boolean comparisons of a Series vs None will now be equivalent to null comparisons
jreback committed Jul 17, 2015
2 parents 5b97367 + effb676 commit 0de48d0
Showing 15 changed files with 363 additions and 233 deletions.
143 changes: 73 additions & 70 deletions doc/source/whatsnew/v0.17.0.txt
@@ -34,6 +34,7 @@ New features

Other enhancements
^^^^^^^^^^^^^^^^^^

- Enable ``read_hdf`` to be used without specifying a key when the HDF file contains a single dataset (:issue:`10443`)

- ``DatetimeIndex`` can be instantiated using strings containing ``NaT`` (:issue:`7599`)
@@ -91,7 +92,7 @@ Backwards incompatible API changes
Changes to convert_objects
^^^^^^^^^^^^^^^^^^^^^^^^^^

``DataFrame.convert_objects`` keyword arguments have been shortened. (:issue:`10265`)

===================== =============
Old                   New
===================== =============
``convert_dates``     ``datetime``
``convert_numeric``   ``numeric``
``convert_timedelta`` ``timedelta``
===================== =============

Coercing types with ``DataFrame.convert_objects`` is now implemented using the
keyword argument ``coerce=True``. Previously types were coerced by setting a
keyword argument to ``'coerce'`` instead of ``True``, as in ``convert_dates='coerce'``.

.. ipython:: python

   df = pd.DataFrame({'i': ['1','2'],
                      'f': ['apple', '4.2'],
                      's': ['apple','banana']})
   df

The old usage of ``DataFrame.convert_objects`` used ``'coerce'`` along with the
type.

.. code-block:: python

   In [2]: df.convert_objects(convert_numeric='coerce')

Now the ``coerce`` keyword must be explicitly used.

.. ipython:: python

   df.convert_objects(numeric=True, coerce=True)

In earlier versions of pandas, ``DataFrame.convert_objects`` would not coerce
numeric types when there were no values convertible to a numeric type, and
returned the original DataFrame with no conversion:

.. code-block:: python

   In [1]: df = pd.DataFrame({'s': ['a','b']})

   In [2]: df.convert_objects(convert_numeric='coerce')
   Out[2]:
      s
   0  a
   1  b

This change alters the behavior so that all non-number-like strings are
converted to ``NaN``:

.. ipython:: python

   pd.DataFrame({'s': ['a','b']})
   df.convert_objects(numeric=True, coerce=True)

In earlier versions of pandas, the default behavior was to try and convert
datetimes and timestamps. The new default is for ``DataFrame.convert_objects``
to do nothing, and so it is necessary to pass at least one conversion target
in the method call.
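Since ``convert_objects`` itself was deprecated in later pandas versions, a present-day runnable sketch of the same coercion semantics uses ``pd.to_numeric`` (an assumption for illustration; the 0.17 API is what the text above documents):

```python
import numpy as np
import pandas as pd

# Coercion turns values that cannot be parsed as numbers into NaN,
# mirroring df.convert_objects(numeric=True, coerce=True)
s = pd.Series(['apple', '4.2'])
coerced = pd.to_numeric(s, errors='coerce')
# 'apple' -> NaN, '4.2' -> 4.2
```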

.. _whatsnew_0170.api_breaking.other:

Changes to Index Comparisons
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Operator equal on ``Index`` should behave similarly to ``Series`` (:issue:`9947`)

Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise
a ``ValueError``. This is to be consistent with the behavior of ``Series``.

Previous behavior:

.. code-block:: python

   In [2]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5])
   Out[2]: array([ True, False, False], dtype=bool)
@@ -188,9 +184,9 @@ Other API Changes
   In [7]: pd.Series([1, 2, 3]) == pd.Series([1, 2])
   ValueError: Series lengths must match to compare

New behavior:

.. code-block:: python

   In [8]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5])
   Out[8]: array([ True, False, False], dtype=bool)
@@ -214,25 +210,27 @@ Other API Changes
   In [13]: pd.Series([1, 2, 3]) == pd.Series([1, 2])
   ValueError: Series lengths must match to compare

Note that this is different from the ``numpy`` behavior where a comparison can
be broadcast:

.. ipython:: python

   np.array([1, 2, 3]) == np.array([1])

or it can return False if broadcasting cannot be done:

.. ipython:: python

   np.array([1, 2, 3]) == np.array([1, 2])
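A minimal runnable sketch contrasting the two behaviors above (assumes pandas >= 0.17):

```python
import numpy as np
import pandas as pd

# numpy broadcasts a length-1 operand across the comparison
broadcast = np.array([1, 2, 3]) == np.array([1])

# pandas raises for length mismatches instead of broadcasting
try:
    pd.Index([1, 2, 3]) == pd.Index([1, 2])
    raised = False
except ValueError:
    raised = True
```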

Other API Changes
^^^^^^^^^^^^^^^^^

- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
- Allow passing ``kwargs`` to the interpolation methods (:issue:`10378`).
- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).


.. _whatsnew_0170.deprecations:

Deprecations
@@ -243,6 +241,8 @@ Deprecations
Removal of prior version deprecations/changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Remove use of some deprecated numpy comparison operations, mainly in tests. (:issue:`10569`)

.. _dask: https://dask.readthedocs.org/en/latest/

.. _whatsnew_0170.gil:
@@ -285,48 +285,51 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

- Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to comparing with ``np.nan``, rather than raise ``TypeError``, xref (:issue:`1079`).
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
- Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
- Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`)
- Bug in ``DataFrame.reset_index`` when index contains ``NaT``. (:issue:`10388`)
- Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`)
- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)
- Bug in ``offsets.generate_range`` where ``start`` and ``end`` have finer precision than ``offset`` (:issue:`9907`)
- Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
- Bug in ``io.sql.get_schema`` when specifying multiple columns as primary key (:issue:`10385`).
- Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
- Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)
- Bug in ``MultiIndex.get_level_values`` including ``Categorical`` raises ``AttributeError`` (:issue:`10460`)
- Bug that caused segfault when resampling an empty Series (:issue:`10228`)
- Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
- Bug in ``pandas.concat`` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
- Bug in ``pandas.read_csv`` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
- Bug in ``Series.from_csv`` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
- Bug in ``groupby.var`` which caused variance to be inaccurate for small float values (:issue:`10448`)
- Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`)
- Bug in operator equal on Index not being consistent with Series (:issue:`9947`)
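The first entry above is the headline change of this PR; a short sketch of the new behavior (assumes pandas >= 0.17):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])

# Previously `s == None` raised TypeError; None is now treated
# like np.nan, and NaN never compares equal to anything
res_none = s == None  # noqa: E711
res_nan = s == np.nan
```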
24 changes: 24 additions & 0 deletions pandas/core/common.py
@@ -462,6 +462,10 @@ def array_equivalent(left, right, strict_nan=False):
    if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
        return ((left == right) | (np.isnan(left) & np.isnan(right))).all()

    # numpy will not allow this type of datetimelike vs integer comparison
    elif is_datetimelike_v_numeric(left, right):
        return False

    # NaNs cannot occur otherwise.
    return np.array_equal(left, right)

@@ -2539,6 +2543,26 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype):
    return issubclass(tipo, (np.datetime64, np.timedelta64))


def is_datetimelike_v_numeric(a, b):
    # return if we have an i8 convertible and numeric comparison
    if not hasattr(a, 'dtype'):
        a = np.asarray(a)
    if not hasattr(b, 'dtype'):
        b = np.asarray(b)
    f = lambda x: is_integer_dtype(x) or is_float_dtype(x)
    return (needs_i8_conversion(a) and f(b)) or (
        needs_i8_conversion(b) and f(a))


def is_datetimelike_v_object(a, b):
    # return if we have an i8 convertible and object comparison
    if not hasattr(a, 'dtype'):
        a = np.asarray(a)
    if not hasattr(b, 'dtype'):
        b = np.asarray(b)
    f = lambda x: is_object_dtype(x)
    return (needs_i8_conversion(a) and f(b)) or (
        needs_i8_conversion(b) and f(a))

needs_i8_conversion = is_datetime_or_timedelta_dtype

def i8_boxer(arr_or_dtype):
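The helper added above guards comparisons between datetime-like and numeric values. A simplified standalone re-implementation using only numpy dtype kinds (the real helper also handles pandas objects via ``needs_i8_conversion``):

```python
import numpy as np

def is_datetimelike_v_numeric(a, b):
    """True when one operand is datetime64/timedelta64 and the other numeric."""
    a, b = np.asarray(a), np.asarray(b)

    def is_i8(x):
        # datetime64 ('M') and timedelta64 ('m') are stored as int64 ("i8")
        return x.dtype.kind in ('M', 'm')

    def is_num(x):
        return x.dtype.kind in ('i', 'u', 'f')

    return (is_i8(a) and is_num(b)) or (is_i8(b) and is_num(a))

dt = np.array(['2015-07-17'], dtype='datetime64[ns]')
is_datetimelike_v_numeric(dt, np.array([1]))     # True
is_datetimelike_v_numeric(np.array([1]), [2.0])  # False: both numeric
```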
9 changes: 8 additions & 1 deletion pandas/core/generic.py
@@ -3574,7 +3574,14 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
        except ValueError:
            new_other = np.array(other)

        # we can end up comparing integers and m8[ns],
        # which is a numpy no-no
        is_i8 = com.needs_i8_conversion(self.dtype)
        if is_i8:
            matches = False
        else:
            matches = (new_other == np.array(other))

        if matches is False or not matches.all():

            # coerce other to a common dtype if we can
22 changes: 11 additions & 11 deletions pandas/core/index.py
@@ -164,18 +164,18 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
        elif data is None or np.isscalar(data):
            cls._scalar_data_error(data)
        else:
            if tupleize_cols and isinstance(data, list) and data and isinstance(data[0], tuple):
                try:

                    # must be orderable in py3
                    if compat.PY3:
                        sorted(data)
                    return MultiIndex.from_tuples(
                        data, names=name or kwargs.get('names'))
                except (TypeError, KeyError):
                    # python2 - MultiIndex fails on mixed types
                    pass

            # other iterable of some kind
            subarr = com._asarray_tuplesafe(data, dtype=object)

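The user-visible behavior the change above preserves can be sketched as follows (assumes a current pandas; ``tupleize_cols`` defaults to True):

```python
import pandas as pd

# a non-empty list whose first element is a tuple is tupleized
# into a MultiIndex by the Index constructor
idx = pd.Index([('a', 1), ('b', 2)])
```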
11 changes: 9 additions & 2 deletions pandas/core/internals.py
@@ -14,7 +14,7 @@
    is_null_datelike_scalar, _maybe_promote,
    is_timedelta64_dtype, is_datetime64_dtype,
    array_equivalent, _maybe_convert_string_to_object,
    is_categorical, needs_i8_conversion, is_datetimelike_v_numeric)
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import maybe_convert_indices, length_of_indexer
from pandas.core.categorical import Categorical, maybe_to_categorical
@@ -3885,9 +3885,16 @@ def _vstack(to_stack, dtype):


def _possibly_compare(a, b, op):

    is_a_array = isinstance(a, np.ndarray)
    is_b_array = isinstance(b, np.ndarray)

    # numpy emits a deprecation warning for i8 vs integer comparisons,
    # so short-circuit to False instead of calling op
    if is_datetimelike_v_numeric(a, b):
        res = False
    else:
        res = op(a, b)

    if np.isscalar(res) and (is_a_array or is_b_array):
        type_names = [type(a).__name__, type(b).__name__]
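The short-circuit in ``_possibly_compare`` mirrors the user-level rule that datetime-like values never compare equal to plain numbers; a sketch of that rule (in current pandas, equality simply returns all-False here rather than warning):

```python
import pandas as pd

s = pd.Series(pd.to_datetime(['2015-01-01', '2015-07-17']))

# equality between datetime64 values and an integer is never True
res = s == 1
```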
