Merge branch 'master' into period_immutable

pandas-dev · Aug 19, 2017 · 27df31a · 27df31a
2 parents b86a7b9 + 4e9c0d1
commit 27df31a
Show file tree

Hide file tree

Showing 74 changed files with 2,310 additions and 1,935 deletions.
diff --git a/appveyor.yml b/appveyor.yml
@@ -72,7 +72,7 @@ install:
   - cmd: conda info -a
 
   # create our env
-  - cmd: conda create -n pandas python=%PYTHON_VERSION% cython pytest pytest-xdist
+  - cmd: conda create -n pandas python=%PYTHON_VERSION% cython "pytest>=3.1.0" pytest-xdist  # quoted so cmd does not treat '>' as output redirection
   - cmd: activate pandas
   - SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.run
   - cmd: echo "installing requirements from %REQ%"

diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
@@ -510,3 +510,17 @@ def time_begin_incr_rng(self):
 
     def time_begin_decr_rng(self):
         self.rng - self.semi_month_begin
+
+
+class DatetimeAccessor(object):  # benchmarks for the Series ``.dt`` accessor
+    def setup(self):  # build the datetime Series read by the time_* methods
+        self.N = 100000
+        self.series = pd.Series(
+            pd.date_range(start='1/1/2000', periods=self.N, freq='T')
+        )
+
+    def time_dt_accessor(self):  # attribute access only; no method is called
+        self.series.dt
+
+    def time_dt_accessor_normalize(self):  # accessor access plus normalize()
+        self.series.dt.normalize()
diff --git a/ci/install_circle.sh b/ci/install_circle.sh
@@ -64,7 +64,7 @@ fi
 # create envbuild deps
 echo "[create env: ${REQ_BUILD}]"
 time conda create -n pandas -q --file=${REQ_BUILD} || exit 1
-time conda install -n pandas pytest || exit 1
+time conda install -n pandas "pytest>=3.1.0" || exit 1  # quoted so '>' is not a shell redirection
 
 source activate pandas
 

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
@@ -103,7 +103,7 @@ if [ -e ${REQ} ]; then
     time bash $REQ || exit 1
 fi
 
-time conda install -n pandas pytest
+time conda install -n pandas "pytest>=3.1.0"  # quoted so '>' is not a shell redirection
 time pip install pytest-xdist
 
 if [ "$LINT" ]; then

diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt
@@ -1,4 +1,4 @@
-pytest
+pytest>=3.1.0
 pytest-cov
 pytest-xdist
 flake8

diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
@@ -2,6 +2,6 @@ python-dateutil
 pytz
 numpy
 cython
-pytest
+pytest>=3.1.0
 pytest-cov
 flake8
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
@@ -598,6 +598,10 @@ Like many packages, *pandas* uses `pytest
 extensions in `numpy.testing
 <http://docs.scipy.org/doc/numpy/reference/routines.testing.html>`_.
 
+.. note::
+
+   The earliest supported pytest version is 3.1.0.
+
 Writing tests
 ~~~~~~~~~~~~~
 
@@ -654,7 +658,9 @@ Using ``pytest``
 Here is an example of a self-contained set of tests that illustrate multiple features that we like to use.
 
 - functional style: tests are like ``test_*`` and *only* take arguments that are either fixtures or parameters
+- ``pytest.mark`` can be used to set metadata on test functions, e.g. ``skip`` or ``xfail``.
 - using ``parametrize``: allow testing of multiple cases
+- to set a mark on a parameter, ``pytest.param(..., marks=...)`` syntax should be used
 - ``fixture``, code for object construction, on a per-test basis
 - using bare ``assert`` for scalars and truth-testing
 - ``tm.assert_series_equal`` (and its counterpart ``tm.assert_frame_equal``), for pandas object comparisons.
@@ -673,6 +679,13 @@ We would name this file ``test_cool_feature.py`` and put in an appropriate place
    def test_dtypes(dtype):
        assert str(np.dtype(dtype)) == dtype
 
+   @pytest.mark.parametrize('dtype', ['float32',
+       pytest.param('int16', marks=pytest.mark.skip),
+       pytest.param('int32',
+                    marks=pytest.mark.xfail(reason='to show how it works'))])
+   def test_mark(dtype):
+       assert str(np.dtype(dtype)) == 'float32'
+
    @pytest.fixture
    def series():
        return pd.Series([1, 2, 3])
@@ -695,13 +708,16 @@ A test run of this yields
 
    (pandas) bash-3.2$ pytest  test_cool_feature.py  -v
    =========================== test session starts ===========================
-   platform darwin -- Python 3.5.2, pytest-3.0.5, py-1.4.31, pluggy-0.4.0
-   collected 8 items
+   platform darwin -- Python 3.6.2, pytest-3.2.1, py-1.4.31, pluggy-0.4.0
+   collected 11 items
 
    tester.py::test_dtypes[int8] PASSED
    tester.py::test_dtypes[int16] PASSED
    tester.py::test_dtypes[int32] PASSED
    tester.py::test_dtypes[int64] PASSED
+   tester.py::test_mark[float32] PASSED
+   tester.py::test_mark[int16] SKIPPED
+   tester.py::test_mark[int32] xfail
    tester.py::test_series[int8] PASSED
    tester.py::test_series[int16] PASSED
    tester.py::test_series[int32] PASSED
@@ -714,8 +730,8 @@ Tests that we have ``parametrized`` are now accessible via the test name, for ex
 
    (pandas) bash-3.2$ pytest  test_cool_feature.py  -v -k int8
    =========================== test session starts ===========================
-   platform darwin -- Python 3.5.2, pytest-3.0.5, py-1.4.31, pluggy-0.4.0
-   collected 8 items
+   platform darwin -- Python 3.6.2, pytest-3.2.1, py-1.4.31, pluggy-0.4.0
+   collected 11 items
 
    test_cool_feature.py::test_dtypes[int8] PASSED
    test_cool_feature.py::test_series[int8] PASSED

diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2020,6 +2020,13 @@ into a flat table.
 .. ipython:: python
 
    from pandas.io.json import json_normalize
+   data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
+           {'name': {'given': 'Mose', 'family': 'Regner'}},
+           {'id': 2, 'name': 'Faye Raker'}]
+   json_normalize(data)
+
+.. ipython:: python
+
    data = [{'state': 'Florida',
              'shortname': 'FL',
              'info': {

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -126,6 +126,8 @@ Other Enhancements
 - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
 - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
+- ``read_*`` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
+- :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`).
 
 .. _whatsnew_0210.api_breaking:
 
@@ -275,7 +277,7 @@ Other API Changes
 - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`)
 - Moved definition of ``MergeError`` to the ``pandas.errors`` module.
 - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`)
-
+- :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`)
 
 .. _whatsnew_0210.deprecations:
 
@@ -306,6 +308,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
+- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
 
 
 .. _whatsnew_0210.bug_fixes:
@@ -317,8 +320,11 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
-- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`)
+- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
+- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)
+- Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods.  Previously returned a ``numpy.bool_``. (:issue:`17237`)
+- Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`)
 
 
 Indexing
@@ -379,6 +385,7 @@ Reshaping
 - Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`)
 - Fixes dtype of result with integer dtype input, from :func:`pivot_table` when called with ``margins=True`` (:issue:`17013`)
 - Bug in :func:`crosstab` where passing two ``Series`` with the same name raised a ``KeyError`` (:issue:`13279`)
+- :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`).
 
 Numeric
 ^^^^^^^
@@ -388,6 +395,9 @@ Numeric
 Categorical
 ^^^^^^^^^^^
 - Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`)
+- Bug in the categorical constructor with empty values and categories causing
+  the ``.categories`` to be an empty ``Float64Index`` rather than an empty
+  ``Index`` with object dtype (:issue:`17248`)
 
 
 Other

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -2,7 +2,27 @@
 
 from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
-from khash cimport *
+from khash cimport (
+    khiter_t,
+
+    kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
+    kh_get_str, kh_destroy_str, kh_resize_str,
+
+    kh_put_strbox, kh_get_strbox, kh_init_strbox,
+
+    kh_int64_t, kh_init_int64, kh_resize_int64, kh_destroy_int64,
+    kh_get_int64, kh_exist_int64, kh_put_int64,
+
+    kh_float64_t, kh_exist_float64, kh_put_float64, kh_init_float64,
+    kh_get_float64, kh_destroy_float64, kh_resize_float64,
+
+    kh_resize_uint64, kh_exist_uint64, kh_destroy_uint64, kh_put_uint64,
+    kh_get_uint64, kh_init_uint64,
+
+    kh_destroy_pymap, kh_exist_pymap, kh_init_pymap, kh_get_pymap,
+    kh_put_pymap, kh_resize_pymap)
+
+
 from numpy cimport *
 
 from libc.stdlib cimport malloc, free

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -1,8 +1,6 @@
 # cython: profile=False
 
-from numpy cimport ndarray
-
-from numpy cimport (float64_t, int32_t, int64_t, uint8_t,
+from numpy cimport (ndarray, float64_t, int32_t, int64_t, uint8_t, uint64_t,
                     NPY_DATETIME, NPY_TIMEDELTA)
 cimport cython
 
@@ -16,7 +14,9 @@ cimport util
 import numpy as np
 
 cimport tslib
-from hashtable cimport *
+
+from hashtable cimport HashTable
+
 from pandas._libs import tslib, algos, hashtable as _hash
 from pandas._libs.tslib import Timestamp, Timedelta
 from datetime import datetime, timedelta
@@ -32,13 +32,9 @@ cdef extern from "datetime.h":
 
 cdef int64_t iNaT = util.get_nat()
 
-try:
-    from dateutil.tz import tzutc as _du_utc
-    import pytz
-    UTC = pytz.utc
-    have_pytz = True
-except ImportError:
-    have_pytz = False
+from dateutil.tz import tzutc as _du_utc
+import pytz
+UTC = pytz.utc
 
 PyDateTime_IMPORT
 

diff --git a/pandas/_libs/join_func_helper.pxi.in b/pandas/_libs/join_func_helper.pxi.in
@@ -9,6 +9,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # asof_join_by
 #----------------------------------------------------------------------
 
+from hashtable cimport PyObjectHashTable, UInt64HashTable, Int64HashTable
+
 {{py:
 
 # table_type, by_dtype
@@ -23,7 +25,6 @@ on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
 }}
 
 
-from hashtable cimport *
 
 {{for table_type, by_dtype in by_dtypes}}
 {{for on_dtype in on_dtypes}}

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -10,21 +10,14 @@ from numpy cimport *
 
 np.import_array()
 
-cdef extern from "numpy/arrayobject.h":
-    cdef enum NPY_TYPES:
-        NPY_intp "NPY_INTP"
-
 from libc.stdlib cimport malloc, free
 
-from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
-                      PyDict_Contains, PyDict_Keys,
-                      Py_INCREF, PyTuple_SET_ITEM,
+from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
                       PyList_Check, PyFloat_Check,
                       PyString_Check,
                       PyBytes_Check,
-                      PyTuple_SetItem,
+                      PyUnicode_Check,
                       PyTuple_New,
-                      PyObject_SetAttrString,
                       PyObject_RichCompareBool,
                       PyBytes_GET_SIZE,
                       PyUnicode_GET_SIZE,
@@ -55,7 +48,18 @@ cdef double NAN = nan
 from datetime import datetime as pydatetime
 
 # this is our tseries.pxd
-from datetime cimport *
+from datetime cimport (
+    get_timedelta64_value, get_datetime64_value,
+    npy_timedelta, npy_datetime,
+    PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check,
+    PyDateTime_GET_YEAR,
+    PyDateTime_GET_MONTH,
+    PyDateTime_GET_DAY,
+    PyDateTime_DATE_GET_HOUR,
+    PyDateTime_DATE_GET_MINUTE,
+    PyDateTime_DATE_GET_SECOND,
+    PyDateTime_IMPORT)
+
 
 from tslib cimport (convert_to_tsobject, convert_to_timedelta64,
                     _check_all_nulls)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -32,7 +32,7 @@ cdef extern from "stdlib.h":
 cimport cython
 cimport numpy as cnp
 
-from numpy cimport ndarray, uint8_t, uint64_t
+from numpy cimport ndarray, uint8_t, uint64_t, int64_t
 
 import numpy as np
 cimport util
@@ -57,7 +57,14 @@ import os
 
 cnp.import_array()
 
-from khash cimport *
+from khash cimport (
+    khiter_t,
+    kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
+    kh_get_str, kh_destroy_str,
+    kh_float64_t, kh_get_float64, kh_destroy_float64,
+    kh_put_float64, kh_init_float64,
+    kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox,
+    kh_destroy_strbox)
 
 import sys
 

diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx
@@ -2,6 +2,7 @@ from datetime import datetime, date, timedelta
 import operator
 
 from cpython cimport (
+    PyUnicode_Check,
     PyObject_RichCompareBool,
     Py_EQ, Py_NE)
 
@@ -18,21 +19,29 @@ from pandas import compat
 from pandas.compat import PY2
 
 cimport cython
-from datetime cimport *
+
+from datetime cimport (
+    is_leapyear,
+    PyDateTime_IMPORT,
+    pandas_datetimestruct,
+    pandas_datetimestruct_to_datetime,
+    pandas_datetime_to_datetimestruct,
+    PANDAS_FR_ns,
+    INT32_MIN)
+
 
 cimport util, lib
 
 from lib cimport is_null_datetimelike, is_period
 from pandas._libs import tslib, lib
 from pandas._libs.tslib import (Timedelta, Timestamp, iNaT,
-                                NaT, have_pytz, _get_utcoffset)
+                                NaT, _get_utcoffset)
 from tslib cimport (
     maybe_get_tz,
     _is_utc,
     _is_tzlocal,
     _get_dst_info,
-    _nat_scalar_rules,
-)
+    _nat_scalar_rules)
 
 from pandas.tseries import offsets
 from pandas.core.tools.datetimes import parse_time_string
@@ -611,9 +620,6 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
         ndarray[int64_t] trans, deltas, pos
         pandas_datetimestruct dts
 
-    if not have_pytz:
-        raise Exception('Could not find pytz module')
-
     if _is_utc(tz):
         for i in range(n):
             if stamps[i] == NPY_NAT:
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,6 @@ python-dateutil @@
     pytz
     numpy
     cython
-    pytest
+    pytest>=3.1.0
     pytest-cov
     flake8