diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index eba96f0c6c2fc..f0772f72d63d4 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -9,16 +9,19 @@
 # In the future we may want to add the validation of docstrings and other checks here.
 #
 # Usage:
-#   $ ./ci/code_checks.sh            # run all checks
-#   $ ./ci/code_checks.sh lint       # run linting only
-#   $ ./ci/code_checks.sh patterns   # check for patterns that should not exist
-#   $ ./ci/code_checks.sh doctests   # run doctests
+#   $ ./ci/code_checks.sh               # run all checks
+#   $ ./ci/code_checks.sh lint          # run linting only
+#   $ ./ci/code_checks.sh patterns      # check for patterns that should not exist
+#   $ ./ci/code_checks.sh doctests      # run doctests
+#   $ ./ci/code_checks.sh dependencies  # check that dependencies are consistent

 echo "inside $0"
 [[ $LINT ]] || { echo "NOT Linting. To lint use: LINT=true $0 $1"; exit 0; }
-[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" ]] || { echo "Unknown command $1. Usage: $0 [lint|patterns|doctests]"; exit 9999; }
+[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" || "$1" == "dependencies" ]] \
+    || { echo "Unknown command $1. Usage: $0 [lint|patterns|doctests|dependencies]"; exit 9999; }

 source activate pandas
+BASE_DIR="$(dirname $0)/.."

 RET=0
 CHECK=$1
@@ -119,6 +122,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     ! grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. (autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    MSG='Check that the deprecated `assert_raises_regex` is not used (`pytest.raises(match=pattern)` should be used instead)' ; echo $MSG
+    ! grep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     MSG='Check for modules that pandas should not import' ; echo $MSG
     python -c "
 import sys
@@ -172,4 +179,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

 fi

+### DEPENDENCIES ###
+if [[ -z "$CHECK" || "$CHECK" == "dependencies" ]]; then
+
+    MSG='Check that requirements-dev.txt has been generated from environment.yml' ; echo $MSG
+    $BASE_DIR/scripts/generate_pip_deps_from_conda.py --compare
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+fi
+
 exit $RET
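Note on the new lint rule: a minimal sketch of the replacement pattern it enforces (hypothetical test shown for illustration). `pytest.raises(match=...)` does a `re.search` against the string form of the raised exception, which is what the deprecated `assert_raises_regex` provided:

```python
import pytest

def test_divide_by_zero():
    # pytest.raises(match=pattern) replaces the deprecated
    # tm.assert_raises_regex helper
    with pytest.raises(ZeroDivisionError, match="division by zero"):
        1 / 0
```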
diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml
deleted file mode 100644
index 2718c1cd582b6..0000000000000
--- a/ci/environment-dev.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: pandas-dev
-channels:
-  - defaults
-  - conda-forge
-dependencies:
-  - Cython>=0.28.2
-  - NumPy
-  - flake8
-  - flake8-comprehensions
-  - flake8-rst
-  - hypothesis>=3.58.0
-  - isort
-  - moto
-  - pytest>=3.6
-  - python-dateutil>=2.5.0
-  - python=3
-  - pytz
-  - setuptools>=24.2.0
-  - sphinx
-  - sphinxcontrib-spelling
diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt
deleted file mode 100644
index 8758c8154abca..0000000000000
--- a/ci/requirements-optional-conda.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-beautifulsoup4>=4.2.1
-blosc
-bottleneck>=1.2.0
-fastparquet>=0.1.2
-gcsfs
-html5lib
-ipython>=5.6.0
-ipykernel
-jinja2
-lxml
-matplotlib>=2.0.0
-nbsphinx
-numexpr>=2.6.1
-openpyxl
-pyarrow>=0.7.0
-pymysql
-pytables>=3.4.2
-pytest-cov
-pytest-xdist
-s3fs
-scipy>=0.18.1
-seaborn
-sqlalchemy
-statsmodels
-xarray
-xlrd
-xlsxwriter
-xlwt
diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
deleted file mode 100644
index a1cb20c265974..0000000000000
--- a/ci/requirements_dev.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# This file was autogenerated by scripts/convert_deps.py
-# Do not modify directly
-Cython>=0.28.2
-NumPy
-flake8
-flake8-comprehensions
-flake8-rst
-hypothesis>=3.58.0
-isort
-moto
-pytest>=3.6
-python-dateutil>=2.5.0
-pytz
-setuptools>=24.2.0
-sphinx
-sphinxcontrib-spelling
\ No newline at end of file
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index 084f710091a1b..514a58456bcd9 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -170,7 +170,7 @@ We'll now kick off a three-step process:
 .. code-block:: none

       # Create and activate the build environment
-      conda env create -f ci/environment-dev.yaml
+      conda env create -f environment.yml
       conda activate pandas-dev

       # or with older versions of Anaconda:
@@ -180,9 +180,6 @@ We'll now kick off a three-step process:
       python setup.py build_ext --inplace -j 4
       python -m pip install -e .

-      # Install the rest of the optional dependencies
-      conda install -c defaults -c conda-forge --file=ci/requirements-optional-conda.txt
-
 At this point you should be able to import pandas from your locally built version::

    $ python  # start an interpreter
@@ -221,14 +218,12 @@ You'll need to have at least python3.5 installed on your system.
    . ~/virtualenvs/pandas-dev/bin/activate

    # Install the build dependencies
-   python -m pip install -r ci/requirements_dev.txt
+   python -m pip install -r requirements-dev.txt
+
+   # Build and install pandas
    python setup.py build_ext --inplace -j 4
    python -m pip install -e .

-   # Install additional dependencies
-   python -m pip install -r ci/requirements-optional-pip.txt
-
 Creating a branch
 -----------------
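For reviewers curious how `scripts/generate_pip_deps_from_conda.py` can keep `requirements-dev.txt` in sync with `environment.yml`, here is a rough, hypothetical sketch of the core conversion; the real script's names and its conda-to-PyPI rename table may differ:

```python
import re
import yaml  # assumes PyYAML is available

# illustrative conda -> PyPI rename table (an assumption, not the script's actual map)
RENAMES = {'pytables': 'tables'}

def conda_deps_to_pip(path='environment.yml'):
    with open(path) as f:
        deps = yaml.safe_load(f)['dependencies']
    pip_deps = []
    for dep in deps:
        if not isinstance(dep, str):
            continue  # skip nested {'pip': [...]} entries
        # split the package name off its version specifier
        name = re.split(r'[<>=]', dep, maxsplit=1)[0]
        pip_deps.append(RENAMES.get(name, name) + dep[len(name):])
    return pip_deps
```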
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 5d29e349e2898..beb1c1daba962 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2861,7 +2861,13 @@ to be parsed.

    read_excel('path_to_file.xls', 'Sheet1', usecols=2)

-If `usecols` is a list of integers, then it is assumed to be the file column
+You can also specify a comma-delimited set of Excel columns and ranges as a string:
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E')
+
+If ``usecols`` is a list of integers, then it is assumed to be the file column
 indices to be parsed.

 .. code-block:: python
@@ -2870,6 +2876,27 @@ indices to be parsed.

 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.

+.. versionadded:: 0.24
+
+If ``usecols`` is a list of strings, it is assumed that each string corresponds
+to a column name provided either by the user in ``names`` or inferred from the
+document header row(s). Those strings define which columns will be parsed:
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar'])
+
+Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``.
+
+.. versionadded:: 0.24
+
+If ``usecols`` is callable, the callable function will be evaluated against
+the column names, returning names where the callable function evaluates to ``True``.
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha())
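To make the three ``usecols`` forms concrete, a small self-contained round trip (requires an Excel engine such as ``openpyxl``; the file name is arbitrary):

```python
import pandas as pd

df = pd.DataFrame({'foo': [1, 2], 'bar': [3, 4], 'baz': [5, 6]})
df.to_excel('tmp.xlsx', sheet_name='Sheet1', index=False)

pd.read_excel('tmp.xlsx', 'Sheet1', usecols='A,C')                 # Excel letters/ranges
pd.read_excel('tmp.xlsx', 'Sheet1', usecols=['foo', 'baz'])        # column names (new)
pd.read_excel('tmp.xlsx', 'Sheet1', usecols=lambda c: c != 'bar')  # callable (new)
```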

 Parsing Dates
 +++++++++++++
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index 6163b6f2ae89a..ff867a2ddfe6d 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -17,6 +17,8 @@ Reshaping and Pivot Tables
 Reshaping by pivoting DataFrame objects
 ---------------------------------------

+.. image:: _static/reshaping_pivot.png
+
 .. ipython::
    :suppress:

@@ -33,8 +35,7 @@ Reshaping by pivoting DataFrame objects

    In [3]: df = unpivot(tm.makeTimeDataFrame())

-Data is often stored in CSV files or databases in so-called "stacked" or
-"record" format:
+Data is often stored in so-called "stacked" or "record" format:

 .. ipython:: python

@@ -66,8 +67,6 @@ To select out everything for variable ``A`` we could do:

    df[df['variable'] == 'A']

-.. image:: _static/reshaping_pivot.png
-
 But suppose we wish to do time series operations with the variables. A better
 representation would be where the ``columns`` are the unique variables and an
 ``index`` of dates identifies individual observations. To reshape the data into
@@ -87,7 +86,7 @@ column:
 .. ipython:: python

    df['value2'] = df['value'] * 2
-   pivoted = df.pivot('date', 'variable')
+   pivoted = df.pivot(index='date', columns='variable')
    pivoted

 You can then select subsets from the pivoted ``DataFrame``:
@@ -99,6 +98,12 @@ You can then select subsets from the pivoted ``DataFrame``:
 Note that this returns a view on the underlying data in the case where the data
 are homogeneously-typed.

+.. note::
+   :func:`~pandas.pivot` will error with a ``ValueError: Index contains duplicate
+   entries, cannot reshape`` if the index/column pair is not unique. In this
+   case, consider using :func:`~pandas.pivot_table` which is a generalization
+   of pivot that can handle duplicate values for one index/column pair.
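The note above in action, as a minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({'date': ['2018-01-01', '2018-01-01'],
                   'variable': ['A', 'A'],
                   'value': [1.0, 2.0]})

# df.pivot(index='date', columns='variable', values='value')
# -> ValueError: Index contains duplicate entries, cannot reshape

# pivot_table aggregates the duplicates instead (default aggfunc='mean')
df.pivot_table(index='date', columns='variable', values='value')  # -> 1.5
```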
+
 .. _reshaping.stacking:

 Reshaping by stacking and unstacking
@@ -704,10 +709,103 @@ handling of NaN:
    In [3]: np.unique(x, return_inverse=True)[::-1]
    Out[3]: (array([3, 3, 0, 4, 1, 2]), array([nan, 3.14, inf, 'A', 'B'], dtype=object))

-
 .. note::
    If you just want to handle one column as a categorical variable (like R's factor),
    you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or ``df["cat_col"] =
    df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`,
    see the :ref:`Categorical introduction <categorical>` and the
    :ref:`API documentation <api.categorical>`.
+
+Examples
+--------
+
+In this section, we will review frequently asked questions and examples. The
+column names and relevant column values are named to correspond with how this
+DataFrame will be pivoted in the answers below.
+
+.. ipython:: python
+
+   np.random.seed([3, 1415])
+   n = 20
+
+   cols = np.array(['key', 'row', 'item', 'col'])
+   df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str))
+   df.columns = cols
+   df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val'))
+
+   df
+
+Pivoting with Single Aggregations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Suppose we want to pivot ``df`` such that the ``col`` values become the columns,
+the ``row`` values become the index, and the values are the mean of ``val0``.
+In particular, the resulting DataFrame should look like:
+
+.. code-block:: ipython
+
+   col   col0   col1   col2   col3  col4
+   row
+   row0  0.77  0.605    NaN  0.860  0.65
+   row2  0.13    NaN  0.395  0.500  0.25
+   row3   NaN  0.310    NaN  0.545   NaN
+   row4   NaN  0.100  0.395  0.760  0.24
+
+This solution uses :func:`~pandas.pivot_table`. Also note that
+``aggfunc='mean'`` is the default. It is included here to be explicit.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values='val0', index='row', columns='col', aggfunc='mean')
+
+Note that we can also replace the missing values by using the ``fill_value``
+parameter.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values='val0', index='row', columns='col', aggfunc='mean', fill_value=0)
+
+Also note that we can pass in other aggregation functions as well. For example,
+we can also pass in ``sum``.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values='val0', index='row', columns='col', aggfunc='sum', fill_value=0)
+
+Another aggregation we can do is calculate the frequency with which the columns
+and rows occur together, a.k.a. a "cross tabulation". To do this, we can pass
+``size`` to the ``aggfunc`` parameter.
+
+.. ipython:: python
+
+   df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size')
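As an aside, the ``aggfunc='size'`` spelling above is equivalent to a cross tabulation with :func:`~pandas.crosstab` on the same columns:

```python
# reuses the example `df` constructed above
import pandas as pd

pd.crosstab(df['row'], df['col'])  # same counts as aggfunc='size' with fill_value=0
```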
+
+Pivoting with Multiple Aggregations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We can also perform multiple aggregations. For example, to perform both a
+``sum`` and ``mean``, we can pass in a list to the ``aggfunc`` argument.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values='val0', index='row', columns='col', aggfunc=['mean', 'sum'])
+
+Note that to aggregate over multiple value columns, we can pass in a list to the
+``values`` parameter.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean'])
+
+Note that to subdivide over multiple columns we can pass in a list to the
+``columns`` parameter.
+
+.. ipython:: python
+
+   df.pivot_table(
+       values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
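Since the multi-aggregation results above carry MultiIndex columns keyed by the aggregation function, individual blocks can be pulled out with ordinary indexing; for example:

```python
# reuses the example `df` from above
result = df.pivot_table(
    values='val0', index='row', columns='col', aggfunc=['mean', 'sum'])
result['mean']  # just the mean block of the MultiIndex columns
```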
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index c6e23ff6e15a4..dd8b72431ba5a 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -238,6 +238,7 @@ Other Enhancements
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
 - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`)
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)
+- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`)

 .. _whatsnew_0240.api_breaking:

@@ -246,6 +247,7 @@ Backwards incompatible API changes

 - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`)
 - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`)
+- Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`)

 .. _whatsnew_0240.api_breaking.deps:

@@ -562,6 +564,7 @@ changes were made:
 - The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
 - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
 - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
+- ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`).

 Some new warnings are issued for operations that require or are likely
 to materialize a large dense array:

@@ -969,6 +972,7 @@ Deprecations
 - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
 - :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`)
+- Constructing a :class:`TimedeltaIndex` from ``datetime64``-dtyped data is deprecated and will raise ``TypeError`` in a future version (:issue:`23539`)

 .. _whatsnew_0240.deprecations.datetimelike_int_ops:

@@ -1129,6 +1133,9 @@ Datetimelike
 - Bug in :class:`PeriodIndex` with attribute ``freq.n`` greater than 1 where adding a :class:`DateOffset` object would return incorrect results (:issue:`23215`)
 - Bug in :class:`Series` that interpreted string indices as lists of characters when setting datetimelike values (:issue:`23451`)
 - Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`)
+- Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
+- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and ``dtype=object`` would incorrectly raise a ``ValueError`` (:issue:`23524`)
+- Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)

 Timedelta
 ^^^^^^^^^
@@ -1175,6 +1182,7 @@ Offsets
 - Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`)
 - Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`)
 - Bug in adding :class:`DateOffset` with :class:`DataFrame` or :class:`PeriodIndex` incorrectly raising ``TypeError`` (:issue:`23215`)
+- Bug in comparing :class:`DateOffset` objects with non-DateOffset objects, particularly strings, raising ``ValueError`` instead of returning ``False`` for equality checks and ``True`` for not-equal checks (:issue:`23524`)

 Numeric
 ^^^^^^^
@@ -1302,6 +1310,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
 - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
 - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
+- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
+- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)

 Plotting
 ^^^^^^^^
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000000000..f66625e6a60c7
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,53 @@
+name: pandas-dev
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  # required
+  - NumPy
+  - python=3
+  - python-dateutil>=2.5.0
+  - pytz
+
+  # development
+  - Cython>=0.28.2
+  - flake8
+  - flake8-comprehensions
+  - flake8-rst
+  - hypothesis>=3.58.0
+  - isort
+  - moto
+  - pytest>=3.6
+  - setuptools>=24.2.0
+  - sphinx
+  - sphinxcontrib-spelling
+
+  # optional
+  - beautifulsoup4>=4.2.1
+  - blosc
+  - bottleneck>=1.2.0
+  - fastparquet>=0.1.2
+  - gcsfs
+  - html5lib
+  - ipython>=5.6.0
+  - ipykernel
+  - jinja2
+  - lxml
+  - matplotlib>=2.0.0
+  - nbsphinx
+  - numexpr>=2.6.1
+  - openpyxl
+  - pyarrow>=0.7.0
+  - pymysql
+  - pytables>=3.4.2
+  - pytest-cov
+  - pytest-xdist
+  - s3fs
+  - scipy>=0.18.1
+  - seaborn
+  - sqlalchemy
+  - statsmodels
+  - xarray
+  - xlrd
+  - xlsxwriter
+  - xlwt
diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index 0888cf3c85f2f..5df1e381ea3ce 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -1,9 +1,6 @@
 from util cimport numeric


-cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
-
-
 cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
     cdef:
         numeric t
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 075e2c5129579..e77899507833f 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -15,8 +15,7 @@ from numpy cimport (ndarray,
                     NPY_FLOAT32, NPY_FLOAT64,
                     NPY_OBJECT,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
-                    uint32_t, uint64_t, float32_t, float64_t,
-                    double_t)
+                    uint32_t, uint64_t, float32_t, float64_t)
 cnp.import_array()


@@ -32,10 +31,9 @@ import missing

 cdef float64_t FP_ERR = 1e-13

-cdef double NaN = np.NaN
-cdef double nan = NaN
+cdef float64_t NaN = np.NaN

-cdef int64_t iNaT = get_nat()
+cdef int64_t NPY_NAT = get_nat()

 tiebreakers = {
     'average': TIEBREAK_AVERAGE,
@@ -199,7 +197,7 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):

 @cython.boundscheck(False)
 @cython.wraparound(False)
-cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
+def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric:
     cdef:
         Py_ssize_t i, j, l, m, n = a.shape[0]
         numeric x
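For context on the `cpdef` → `def` change: `kth_smallest` is the classic in-place quickselect (as in Wirth / Numerical Recipes, credited later in `window.pyx`). A pure-Python rendering of the same loop, for readers following along:

```python
def kth_smallest(a, k):
    # in-place quickselect: after the loop, a[k] holds the k-th smallest value
    l, m = 0, len(a) - 1
    while l < m:
        x = a[k]
        i, j = l, m
        while True:
            while a[i] < x:
                i += 1
            while x < a[j]:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
            if i > j:
                break
        if j < k:
            l = i
        if k < i:
            m = j
    return a[k]
```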
@@ -812,7 +810,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
     n = len(arr)

     if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
+        if arr[0] != arr[0] or (timelike and arr[0] == NPY_NAT):
             # single value is NaN
             return False, False, True
         else:
@@ -820,7 +818,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
     elif n < 2:
         return True, True, True

-    if timelike and arr[0] == iNaT:
+    if timelike and arr[0] == NPY_NAT:
         return False, False, True

     if algos_t is not object:
@@ -828,7 +826,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
         prev = arr[0]
         for i in range(1, n):
             cur = arr[i]
-            if timelike and cur == iNaT:
+            if timelike and cur == NPY_NAT:
                 is_monotonic_inc = 0
                 is_monotonic_dec = 0
                 break
@@ -853,7 +851,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
         prev = arr[0]
         for i in range(1, n):
             cur = arr[i]
-            if timelike and cur == iNaT:
+            if timelike and cur == NPY_NAT:
                 is_monotonic_inc = 0
                 is_monotonic_dec = 0
                 break
diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index c2b0a4119e6e5..3708deb1a4b76 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -84,9 +84,9 @@ def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,

 {{endfor}}

-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # ensure_dtype
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 cdef int PLATFORM_INT = (np.arange(0, dtype=np.intp)).descr.type_num
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index fcb052e8be63b..4d144dcf2808a 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -74,9 +74,9 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
         {{elif dtype == 'float64'}}
         mask = np.isnan(values)
         {{elif dtype == 'int64'}}
-        mask = values == iNaT
+        mask = values == NPY_NAT

-        # create copy in case of iNaT
+        # create copy in case of NPY_NAT
         # values are mutated inplace
         if mask.any():
             values = values.copy()
@@ -149,7 +149,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
             {{if dtype != 'uint64'}}
             isnan = sorted_mask[i]
             if isnan and keep_na:
-                ranks[argsorted[i]] = nan
+                ranks[argsorted[i]] = NaN
                 continue
             {{endif}}
@@ -257,7 +257,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
         {{elif dtype == 'float64'}}
         mask = np.isnan(values)
         {{elif dtype == 'int64'}}
-        mask = values == iNaT
+        mask = values == NPY_NAT
         {{endif}}

         np.putmask(values, mask, nan_value)
@@ -317,7 +317,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
                 {{else}}
                 if (val == nan_value) and keep_na:
                 {{endif}}
-                    ranks[i, argsorted[i, j]] = nan
+                    ranks[i, argsorted[i, j]] = NaN

                     {{if dtype == 'object'}}
                     infs += 1
diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index bd5feef1ff2b0..2fea8b17fd9d7 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -4,9 +4,9 @@ Template for each `dtype` helper function for take

 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """

-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # take_1d, take_2d
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 {{py:
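Background on the `iNaT` → `NPY_NAT` rename running through these files: both names refer to the same sentinel, the minimum ``int64`` value that NumPy stores for ``NaT``:

```python
import numpy as np

# the int64 payload of a datetime64 NaT is the NPY_NAT sentinel
npy_nat = np.array(['NaT'], dtype='datetime64[ns]').view('int64')[0]
assert npy_nat == np.iinfo(np.int64).min  # -9223372036854775808
```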
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 83ded64b742ed..7c16b29f3e42b 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1,14 +1,13 @@
 # -*- coding: utf-8 -*-

-cimport cython
-from cython cimport Py_ssize_t
+import cython
+from cython import Py_ssize_t

 from libc.stdlib cimport malloc, free

 import numpy as np
 cimport numpy as cnp
 from numpy cimport (ndarray,
-                    double_t,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
                     uint32_t, uint64_t, float32_t, float64_t)
 cnp.import_array()
@@ -20,10 +19,9 @@ from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN,
                     TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE)
 from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers

-cdef int64_t iNaT = get_nat()
+cdef int64_t NPY_NAT = get_nat()

-cdef double NaN = np.NaN
-cdef double nan = NaN
+cdef float64_t NaN = np.NaN


 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
@@ -67,13 +65,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     return result


-# TODO: Is this redundant with algos.kth_smallest?
+# TODO: Is this redundant with algos.kth_smallest
 cdef inline float64_t kth_smallest_c(float64_t* a,
                                      Py_ssize_t k,
                                      Py_ssize_t n) nogil:
     cdef:
         Py_ssize_t i, j, l, m
-        double_t x, t
+        float64_t x, t

     l = 0
     m = n - 1
@@ -109,7 +107,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
     cdef:
         Py_ssize_t i, j, N, K, ngroups, size
         ndarray[int64_t] _counts
-        ndarray data
+        ndarray[float64_t, ndim=2] data
        float64_t* ptr

     assert min_count == -1, "'min_count' only used in add and prod"
@@ -139,8 +137,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumprod_float64(float64_t[:, :] out,
-                          float64_t[:, :] values,
-                          int64_t[:] labels,
+                          const float64_t[:, :] values,
+                          const int64_t[:] labels,
                           bint is_datetimelike,
                           bint skipna=True):
     """
@@ -177,7 +175,7 @@ def group_cumprod_float64(float64_t[:, :] out,
 @cython.wraparound(False)
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
-                 int64_t[:] labels,
+                 const int64_t[:] labels,
                  is_datetimelike,
                  bint skipna=True):
     """
@@ -217,7 +215,7 @@ def group_cumsum(numeric[:, :] out,

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
+def group_shift_indexer(int64_t[:] out, const int64_t[:] labels,
                         int ngroups, int periods):
     cdef:
         Py_ssize_t N, i, j, ii
@@ -291,7 +289,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     """
     cdef:
         Py_ssize_t i, N
-        ndarray[int64_t] sorted_labels
+        int64_t[:] sorted_labels
         int64_t idx, curr_fill_idx=-1, filled_vals=0

     N = len(out)
@@ -327,10 +325,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_any_all(ndarray[uint8_t] out,
-                  ndarray[int64_t] labels,
-                  ndarray[uint8_t] values,
-                  ndarray[uint8_t] mask,
+def group_any_all(uint8_t[:] out,
+                  const int64_t[:] labels,
+                  const uint8_t[:] values,
+                  const uint8_t[:] mask,
                   object val_test,
                   bint skipna):
     """Aggregated boolean values to show truthfulness of group elements
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 484a4b069305f..523d43f893aad 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -5,7 +5,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """

 cdef extern from "numpy/npy_math.h":
-    double NAN "NPY_NAN"
+    float64_t NAN "NPY_NAN"
 _int64_max = np.iinfo(np.int64).max

 # ----------------------------------------------------------------------
@@ -268,16 +268,16 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,

 {{endfor}}

-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # group_nth, group_last, group_rank
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 {{py:

 # name, c_type, nan_val
 dtypes = [('float64', 'float64_t', 'NAN'),
           ('float32', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'iNaT'),
+          ('int64', 'int64_t', 'NPY_NAT'),
           ('object', 'object', 'NAN')]

 def get_dispatch(dtypes):
@@ -527,7 +527,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                 # to the result where appropriate
                 if keep_na and mask[_as[i]]:
                     for j in range(i - dups + 1, i + 1):
-                        out[_as[j], 0] = nan
+                        out[_as[j], 0] = NaN
                     grp_na_count = dups
                 elif tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
@@ -630,7 +630,7 @@ def group_max(ndarray[groupby_t, ndim=2] out,
     if groupby_t is int64_t:
         # Note: evaluated at compile-time
         maxx[:] = -_int64_max
-        nan_val = iNaT
+        nan_val = NPY_NAT
     else:
         maxx[:] = -np.inf
         nan_val = NAN
@@ -692,7 +692,7 @@ def group_min(ndarray[groupby_t, ndim=2] out,
     minx = np.empty_like(out)
     if groupby_t is int64_t:
         minx[:] = _int64_max
-        nan_val = iNaT
+        nan_val = NPY_NAT
     else:
         minx[:] = np.inf
         nan_val = NAN
@@ -762,8 +762,8 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,

                 # val = nan
                 if groupby_t is int64_t:
-                    if is_datetimelike and val == iNaT:
-                        out[i, j] = iNaT
+                    if is_datetimelike and val == NPY_NAT:
+                        out[i, j] = NPY_NAT
                     else:
                         mval = accum[lab, j]
                         if val < mval:
@@ -809,8 +809,8 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
                 val = values[i, j]

                 if groupby_t is int64_t:
-                    if is_datetimelike and val == iNaT:
-                        out[i, j] = iNaT
+                    if is_datetimelike and val == NPY_NAT:
+                        out[i, j] = NPY_NAT
                     else:
                         mval = accum[lab, j]
                         if val > mval:
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index d38b72ccebbb2..9aa887727a765 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -9,11 +9,11 @@ from libc.stdlib cimport malloc, free

 import numpy as np
 cimport numpy as cnp
-from numpy cimport ndarray, uint8_t, uint32_t
+from numpy cimport ndarray, uint8_t, uint32_t, float64_t
 cnp.import_array()

 cdef extern from "numpy/npy_math.h":
-    double NAN "NPY_NAN"
+    float64_t NAN "NPY_NAN"


 from khash cimport (
@@ -42,9 +42,7 @@ cimport util

 from missing cimport checknull


-nan = np.nan
-
-cdef int64_t iNaT = util.get_nat()
+cdef int64_t NPY_NAT = util.get_nat()

 _SIZE_HINT_LIMIT = (1 << 20) + 7
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 36ed8a88aa78b..a71023ed34f44 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -251,9 +251,9 @@ cdef class HashTable:
 {{py:

 # name, dtype, float_group, default_na_value
-dtypes = [('Float64', 'float64', True, 'nan'),
+dtypes = [('Float64', 'float64', True, 'np.nan'),
           ('UInt64', 'uint64', False, 0),
-          ('Int64', 'int64', False, 'iNaT')]
+          ('Int64', 'int64', False, 'NPY_NAT')]

 }}
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index d418ac63a4ac8..7930f583274b5 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -25,7 +25,7 @@ from pandas._libs import algos, hashtable as _hash
 from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
 from pandas._libs.missing import checknull

-cdef int64_t iNaT = util.get_nat()
+cdef int64_t NPY_NAT = util.get_nat()


 cdef inline bint is_definitely_invalid_key(object val):
@@ -520,7 +520,7 @@ cpdef convert_scalar(ndarray arr, object value):
         elif isinstance(value, (datetime, np.datetime64, date)):
             return Timestamp(value).value
         elif value is None or value != value:
-            return iNaT
+            return NPY_NAT
         elif util.is_string_object(value):
             return Timestamp(value).value
         raise ValueError("cannot set a Timestamp with a non-timestamp")
@@ -531,7 +531,7 @@ cpdef convert_scalar(ndarray arr, object value):
         elif isinstance(value, timedelta):
             return Timedelta(value).value
         elif value is None or value != value:
-            return iNaT
+            return NPY_NAT
         elif util.is_string_object(value):
             return Timedelta(value).value
         raise ValueError("cannot set a Timedelta with a non-timedelta")
diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in
index c19812efaaa35..ff95917f6643a 100644
--- a/pandas/_libs/index_class_helper.pxi.in
+++ b/pandas/_libs/index_class_helper.pxi.in
@@ -4,9 +4,9 @@ Template for functions of IndexEngine subclasses.
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """

-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # IndexEngine Subclass Methods
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 {{py:
diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
index a395fdbabeca2..dae88d3b707bf 100644
--- a/pandas/_libs/interval.pyx
+++ b/pandas/_libs/interval.pyx
@@ -1,20 +1,27 @@
 # -*- coding: utf-8 -*-
 import numbers
+from operator import le, lt

 from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE,
                              PyObject_RichCompare)

-cimport cython
-from cython cimport Py_ssize_t
+import cython
+from cython import Py_ssize_t

 import numpy as np
-from numpy cimport ndarray
+cimport numpy as cnp
+from numpy cimport (
+    int64_t, int32_t, float64_t, float32_t, uint64_t,
+    ndarray,
+    PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take)
+cnp.import_array()

-from operator import le, lt

 cimport util
 util.import_array()

+from hashtable cimport Int64Vector, Int64VectorData
+
 from tslibs import Timestamp
 from tslibs.timezones cimport tz_compare
diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in
index f9427fbbcd900..aa53f5086b894 100644
--- a/pandas/_libs/intervaltree.pxi.in
+++ b/pandas/_libs/intervaltree.pxi.in
@@ -4,21 +4,6 @@ Template for intervaltree

 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """

-from numpy cimport (
-    int64_t, int32_t, float64_t, float32_t, uint64_t,
-    ndarray,
-    PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take)
-import numpy as np
-
-cimport cython
-from cython cimport Py_ssize_t
-
-cimport numpy as cnp
-cnp.import_array()
-
-from hashtable cimport Int64Vector, Int64VectorData
-
-
 ctypedef fused scalar_t:
     float64_t
     float32_t
@@ -26,10 +11,9 @@ ctypedef fused scalar_t:
     int32_t
     uint64_t

-
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # IntervalTree
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 cdef class IntervalTree(IntervalMixin):
     """A centered interval tree
@@ -203,9 +187,10 @@ cdef sort_values_and_indices(all_values, all_indices, subset):
         sorted_indices = take(indices, sorter)
     return sorted_values, sorted_indices

-#----------------------------------------------------------------------
+
+# ----------------------------------------------------------------------
 # Nodes
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 # we need specialized nodes and leaves to optimize for different dtype and
 # closed values
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index 748f3f265dd34..54dfeeff1452d 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -10,10 +10,6 @@ from numpy cimport (ndarray,
                     uint32_t, uint64_t, float32_t, float64_t)
 cnp.import_array()

-
-cdef double NaN = np.NaN
-cdef double nan = NaN
-
 from pandas._libs.algos import groupsort_indexer, ensure_platform_int
 from pandas.core.algorithms import take_nd

@@ -673,7 +669,7 @@ ctypedef fused asof_t:
     int32_t
     int64_t
     float
-    double
+    float64_t

 ctypedef fused by_t:
     object
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a9e0fcbc4a826..cfc60256e97a3 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -45,13 +45,14 @@ cdef extern from "numpy/arrayobject.h":


 cdef extern from "src/parse_helper.h":
-    int floatify(object, double *result, int *maybe_int) except -1
+    int floatify(object, float64_t *result, int *maybe_int) except -1

 cimport util
 from util cimport (is_nan,
                    UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN)

 from tslib import array_to_datetime
+from tslibs.nattype cimport NPY_NAT
 from tslibs.nattype import NaT
 from tslibs.conversion cimport convert_to_tsobject
 from tslibs.timedeltas cimport convert_to_timedelta64
@@ -67,11 +68,8 @@ cdef object oINT64_MAX = INT64_MAX
 cdef object oINT64_MIN = INT64_MIN
 cdef object oUINT64_MAX = UINT64_MAX

-cdef int64_t NPY_NAT = util.get_nat()
-iNaT = util.get_nat()
-
 cdef bint PY2 = sys.version_info[0] == 2
-cdef double nan = np.NaN
+cdef float64_t NaN = np.NaN


 def values_from_object(obj: object):
@@ -104,7 +102,7 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t:
 # ----------------------------------------------------------------------


-def is_scalar(val: object) -> bint:
+def is_scalar(val: object) -> bool:
     """
     Return True if given value is scalar.

@@ -628,7 +626,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,

     nat_count = 0
     if hasnans:
-        mask = values == iNaT
+        mask = values == NPY_NAT
         nat_count = np.sum(mask)
         values = values[~mask]

@@ -1816,7 +1814,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,

             if val.__hash__ is not None and val in na_values:
                 seen.saw_null()
-                floats[i] = complexes[i] = nan
+                floats[i] = complexes[i] = NaN
             elif util.is_float_object(val):
                 fval = val
                 if fval != fval:
@@ -1847,11 +1845,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
                 seen.bool_ = True
             elif val is None:
                 seen.saw_null()
-                floats[i] = complexes[i] = nan
+                floats[i] = complexes[i] = NaN
             elif hasattr(val, '__len__') and len(val) == 0:
                 if convert_empty or seen.coerce_numeric:
                     seen.saw_null()
-                    floats[i] = complexes[i] = nan
+                    floats[i] = complexes[i] = NaN
                 else:
                     raise ValueError('Empty string encountered')
             elif util.is_complex_object(val):
@@ -1866,7 +1864,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,

                 if fval in na_values:
                     seen.saw_null()
-                    floats[i] = complexes[i] = nan
+                    floats[i] = complexes[i] = NaN
                 else:
                     if fval != fval:
                         seen.null_ = True
@@ -1899,7 +1897,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
                 elif "uint64" in str(e):  # Exception from check functions.
                     raise

                 seen.saw_null()
-                floats[i] = nan
+                floats[i] = NaN

     if seen.check_uint64_conflict():
         return values
@@ -1967,10 +1965,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                 floats[i] = complexes[i] = fnan
         elif val is NaT:
             if convert_datetime:
-                idatetimes[i] = iNaT
+                idatetimes[i] = NPY_NAT
                 seen.datetime_ = 1
             if convert_timedelta:
-                itimedeltas[i] = iNaT
+                itimedeltas[i] = NPY_NAT
                 seen.timedelta_ = 1
             if not (convert_datetime or convert_timedelta):
                 seen.object_ = 1
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index b8791359241ad..1fdb04dd10d8e 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -5,16 +5,17 @@ from cython import Py_ssize_t

 import numpy as np
 cimport numpy as cnp
-from numpy cimport ndarray, int64_t, uint8_t
+from numpy cimport ndarray, int64_t, uint8_t, float64_t
 cnp.import_array()

 cimport util

 from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
+from tslibs.nattype cimport checknull_with_nat
 from tslibs.nattype import NaT

-cdef double INF = np.inf
-cdef double NEGINF = -INF
+cdef float64_t INF = np.inf
+cdef float64_t NEGINF = -INF

 cdef int64_t NPY_NAT = util.get_nat()

@@ -295,9 +296,7 @@ def isneginf_scalar(val: object) -> bool:
 cdef inline bint is_null_datetime64(v):
     # determine if we have a null for a datetime (or integer versions),
     # excluding np.timedelta64('nat')
-    if v is None or util.is_nan(v):
-        return True
-    elif v is NaT:
+    if checknull_with_nat(v):
         return True
     elif util.is_datetime64_object(v):
         return v.view('int64') == NPY_NAT
@@ -307,9 +306,7 @@ cdef inline bint is_null_datetime64(v):
 cdef inline bint is_null_timedelta64(v):
     # determine if we have a null for a timedelta (or integer versions),
     # excluding np.datetime64('nat')
-    if v is None or util.is_nan(v):
-        return True
-    elif v is NaT:
+    if checknull_with_nat(v):
         return True
     elif util.is_timedelta64_object(v):
         return v.view('int64') == NPY_NAT
@@ -319,8 +316,4 @@ cdef inline bint is_null_timedelta64(v):
 cdef inline bint is_null_period(v):
     # determine if we have a null for a Period (or integer versions),
     # excluding np.datetime64('nat') and np.timedelta64('nat')
-    if v is None or util.is_nan(v):
-        return True
-    elif v is NaT:
-        return True
-    return False
+    return checknull_with_nat(v)
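A pure-Python sketch of the consolidated null check (`checknull_with_nat` plus the ``datetime64`` branch); this is a simplified assumption-level rendering, not the Cython helper itself:

```python
import numpy as np
from pandas import NaT

NPY_NAT = np.iinfo(np.int64).min

def is_null_datetime64(v):
    # None, NaN or NaT -> null; this part is what checknull_with_nat covers
    if v is None or v is NaT or (isinstance(v, float) and v != v):
        return True
    # np.datetime64('NaT') stores the NPY_NAT sentinel in its int64 view
    if isinstance(v, np.datetime64):
        return v.view('int64') == NPY_NAT
    return False
```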
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 391de339ad60e..3870a55c22fd6 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -65,8 +65,8 @@ CParserError = ParserError

 cdef bint PY3 = (sys.version_info[0] >= 3)

-cdef double INF = np.inf
-cdef double NEGINF = -INF
+cdef float64_t INF = np.inf
+cdef float64_t NEGINF = -INF


 cdef extern from "errno.h":
@@ -182,10 +182,10 @@ cdef extern from "parser/tokenizer.h":
         int64_t skip_first_N_rows
         int64_t skipfooter
         # pick one, depending on whether the converter requires GIL
-        double (*double_converter_nogil)(const char *, char **,
-                                         char, char, char, int) nogil
-        double (*double_converter_withgil)(const char *, char **,
-                                           char, char, char, int)
+        float64_t (*double_converter_nogil)(const char *, char **,
+                                            char, char, char, int) nogil
+        float64_t (*double_converter_withgil)(const char *, char **,
+                                              char, char, char, int)

         #  error handling
         char *warn_msg
@@ -233,12 +233,12 @@ cdef extern from "parser/tokenizer.h":
     uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                            uint64_t uint_max, int *error, char tsep) nogil

-    double xstrtod(const char *p, char **q, char decimal, char sci,
-                   char tsep, int skip_trailing) nogil
-    double precise_xstrtod(const char *p, char **q, char decimal, char sci,
-                           char tsep, int skip_trailing) nogil
-    double round_trip(const char *p, char **q, char decimal, char sci,
-                      char tsep, int skip_trailing) nogil
+    float64_t xstrtod(const char *p, char **q, char decimal, char sci,
+                      char tsep, int skip_trailing) nogil
+    float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci,
+                              char tsep, int skip_trailing) nogil
+    float64_t round_trip(const char *p, char **q, char decimal, char sci,
+                         char tsep, int skip_trailing) nogil

     int to_boolean(const char *item, uint8_t *val) nogil
@@ -1697,8 +1697,8 @@ cdef _try_double(parser_t *parser, int64_t col,
         coliter_t it
         const char *word = NULL
         char *p_end
-        double *data
-        double NA = na_values[np.float64]
+        float64_t *data
+        float64_t NA = na_values[np.float64]
         kh_float64_t *na_fset
         ndarray result
         khiter_t k
@@ -1706,7 +1706,7 @@ cdef _try_double(parser_t *parser, int64_t col,

     lines = line_end - line_start
     result = np.empty(lines, dtype=np.float64)
-    data = <double *>result.data
+    data = <float64_t *>result.data
     na_fset = kset_float64_from_list(na_flist)
     if parser.double_converter_nogil != NULL:  # if it can run without the GIL
         with nogil:
@@ -1717,8 +1717,8 @@ cdef _try_double(parser_t *parser, int64_t col,
     else:
         assert parser.double_converter_withgil != NULL
         error = _try_double_nogil(parser,
-                                  <double (*)(const char *, char **,
-                                              char, char, char, int)
+                                  <float64_t (*)(const char *, char **,
+                                                 char, char, char, int)
                                   nogil>parser.double_converter_withgil,
                                   col, line_start, line_end,
                                   na_filter, na_hashset, use_na_flist,
@@ -1730,14 +1730,14 @@ cdef _try_double(parser_t *parser, int64_t col,


 cdef inline int _try_double_nogil(parser_t *parser,
-                                  double (*double_converter)(
+                                  float64_t (*double_converter)(
                                       const char *, char **, char,
                                       char, char, int) nogil,
                                   int col, int line_start, int line_end,
                                   bint na_filter, kh_str_t *na_hashset,
                                   bint use_na_flist,
                                   const kh_float64_t *na_flist,
-                                  double NA, double *data,
+                                  float64_t NA, float64_t *data,
                                   int *na_count) nogil:
     cdef:
         int error,
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index b8ca744ac88c4..668bd0ae6bbb7 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -22,9 +22,6 @@ _np_version_under1p11 = LooseVersion(_np_version) < LooseVersion('1.11')
 cdef float64_t NaN = np.NaN
 cdef float64_t INF = np.inf

-cdef inline int int_max(int a, int b): return a if a >= b else b
-cdef inline int int_min(int a, int b): return a if a <= b else b
-

 # -----------------------------------------------------------------------------
@@ -673,13 +670,6 @@ cdef class BlockMerge(object):
             self.yi = xi


-cdef class BlockIntersection(BlockMerge):
-    """
-    not done yet
-    """
-    pass
-
-
 cdef class BlockUnion(BlockMerge):
     """
     Object-oriented approach makes sharing state between recursive functions a
@@ -805,63 +795,6 @@ cdef class BlockUnion(BlockMerge):

 include "sparse_op_helper.pxi"


-# -----------------------------------------------------------------------------
-# Indexing operations
-
-def get_reindexer(ndarray[object, ndim=1] values, dict index_map):
-    cdef:
-        object idx
-        Py_ssize_t i
-        Py_ssize_t new_length = len(values)
-        ndarray[int32_t, ndim=1] indexer
-
-    indexer = np.empty(new_length, dtype=np.int32)
-
-    for i in range(new_length):
-        idx = values[i]
-        if idx in index_map:
-            indexer[i] = index_map[idx]
-        else:
-            indexer[i] = -1
-
-    return indexer
-
-# def reindex_block(ndarray[float64_t, ndim=1] values,
-#                   BlockIndex sparse_index,
-#                   ndarray[int32_t, ndim=1] indexer):
-#     cdef:
-#         Py_ssize_t i, length
-#         ndarray[float64_t, ndim=1] out
-
-#     out = np.empty(length, dtype=np.float64)
-
-#     for i in range(length):
-#         if indexer[i] == -1:
-#             pass
-
-
-# cdef class SparseCruncher(object):
-#     """
-#     Class to acquire float pointer for convenient operations on sparse data
-#     structures
-#     """
-#     cdef:
-#         SparseIndex index
-#         float64_t* buf
-
-#     def __init__(self, ndarray[float64_t, ndim=1, mode='c'] values,
-#                  SparseIndex index):
-
-#         self.index = index
-#         self.buf = <float64_t*>values.data
-
-
-def reindex_integer(ndarray[float64_t, ndim=1] values,
-                    IntIndex sparse_index,
-                    ndarray[int32_t, ndim=1] indexer):
-    pass
-
-
 # -----------------------------------------------------------------------------
 # SparseArray mask create operations
diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in
index d02a985de1d61..1f41096a3f194 100644
--- a/pandas/_libs/sparse_op_helper.pxi.in
+++ b/pandas/_libs/sparse_op_helper.pxi.in
@@ -4,9 +4,9 @@ Template for each `dtype` helper function for sparse ops

 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """

-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Sparse op
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------

 ctypedef fused sparse_t:
     float64_t
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 9012ebefe0975..e346eb7e598ed 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import cython
 from cython import Py_ssize_t

 from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
@@ -37,7 +38,8 @@ from tslibs.conversion cimport (tz_convert_single, _TSObject,
                                 get_datetime64_nanos,
                                 tz_convert_utc_to_tzlocal)

-from tslibs.nattype import NaT, nat_strings, iNaT
+# many modules still look for NaT and iNaT here despite them not being needed
+from tslibs.nattype import nat_strings, NaT, iNaT  # noqa:F821
 from tslibs.nattype cimport checknull_with_nat, NPY_NAT

 from tslibs.offsets cimport to_offset
@@ -71,6 +73,8 @@ cdef inline object create_time_from_ts(
     return time(dts.hour, dts.min, dts.sec, dts.us, tz)


+@cython.wraparound(False)
+@cython.boundscheck(False)
 def ints_to_pydatetime(int64_t[:] arr, tz=None, freq=None, box="datetime"):
     """
     Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp
@@ -213,6 +217,8 @@ def _test_parse_iso8601(object ts):
     return Timestamp(obj.value)


+@cython.wraparound(False)
+@cython.boundscheck(False)
 def format_array_from_datetime(ndarray[int64_t] values, object tz=None,
                                object format=None, object na_rep=None):
     """
@@ -335,7 +341,7 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
     # then need to iterate
     try:
         iresult = values.astype('i8', casting='same_kind', copy=False)
-        mask = iresult == iNaT
+        mask = iresult == NPY_NAT
         iresult[mask] = 0
         fvalues = iresult.astype('f8') * m
         need_to_iterate = False
@@ -351,7 +357,7 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
                             "'{unit}'".format(unit=unit))
         result = (iresult * m).astype('M8[ns]')
         iresult = result.view('i8')
-        iresult[mask] = iNaT
+        iresult[mask] = NPY_NAT
         return result

     result = np.empty(n, dtype='M8[ns]')
@@ -449,6 +455,8 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
     return oresult


+@cython.wraparound(False)
+@cython.boundscheck(False)
 cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         dayfirst=False, yearfirst=False,
                         format=None, utc=None,
@@ -752,6 +760,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
     return array_to_datetime_object(values, is_raise, dayfirst, yearfirst)


+@cython.wraparound(False)
+@cython.boundscheck(False)
 cdef array_to_datetime_object(ndarray[object] values, bint is_raise,
                               dayfirst=False, yearfirst=False):
     """
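On the `@cython.wraparound(False)` / `@cython.boundscheck(False)` decorators added above: the `cython` module also ships pure-Python shims for these directives, so the pattern can be written outside a `.pyx` file too (they are no-ops under plain CPython and only take effect when the code is compiled; requires Cython installed):

```python
import cython

@cython.wraparound(False)    # compiled: skip negative-index translation
@cython.boundscheck(False)   # compiled: skip bounds checks on indexing
def total(values):
    s = 0.0
    for i in range(len(values)):
        s += values[i]
    return s
```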
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 0495202818eb5..7ef38cba0c37f 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -308,8 +308,13 @@ class _BaseOffset(object):

     def __eq__(self, other):
         if is_string_object(other):
-            other = to_offset(other)
-
+            try:
+                # GH#23524 if to_offset fails, we are dealing with an
+                # incomparable type so == is False and != is True
+                other = to_offset(other)
+            except ValueError:
+                # e.g. "infer"
+                return False
         try:
             return self._params == other._params
         except AttributeError:
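The behavior change from the ``try``/``except`` above, illustrated:

```python
from pandas.tseries.offsets import DateOffset

# GH#23524: comparing against a string that is not a valid offset alias
# no longer propagates the ValueError from to_offset
DateOffset(days=1) == 'infer'   # False (previously raised ValueError)
DateOffset(days=1) != 'infer'   # True
```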
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index e2914957d01cd..457f5003cb9a5 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import enum
 import warnings

 from cpython cimport (PyObject_RichCompareBool, PyObject_RichCompare,
@@ -23,7 +24,6 @@ cimport ccalendar
 from conversion import tz_localize_to_utc, normalize_i8_timestamps
 from conversion cimport (tz_convert_single, _TSObject,
                          convert_to_tsobject,
                          convert_datetime_to_tsobject)
-import enum
 from fields import get_start_end_field, get_date_name_field
 from nattype import NaT
 from nattype cimport NPY_NAT
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
index bb7af67d14585..f517e0933264a 100644
--- a/pandas/_libs/window.pyx
+++ b/pandas/_libs/window.pyx
@@ -9,15 +9,15 @@ from libc.stdlib cimport malloc, free

 import numpy as np
 cimport numpy as cnp
-from numpy cimport ndarray, double_t, int64_t, float64_t, float32_t
+from numpy cimport ndarray, int64_t, float64_t, float32_t
 cnp.import_array()


 cdef extern from "src/headers/cmath" namespace "std":
-    bint isnan(double) nogil
-    bint notnan(double) nogil
-    int signbit(double) nogil
-    double sqrt(double x) nogil
+    bint isnan(float64_t) nogil
+    bint notnan(float64_t) nogil
+    int signbit(float64_t) nogil
+    float64_t sqrt(float64_t x) nogil

 cimport util
 from util cimport numeric
@@ -32,7 +32,7 @@ cdef float64_t MINfloat64 = np.NINF
 cdef float32_t MAXfloat32 = np.inf
 cdef float64_t MAXfloat64 = np.inf

-cdef double NaN = np.NaN
+cdef float64_t NaN = np.NaN

 cdef inline int int_max(int a, int b): return a if a >= b else b
 cdef inline int int_min(int a, int b): return a if a <= b else b
@@ -80,6 +80,7 @@ def _check_minp(win, minp, N, floor=None):
     return max(minp, floor)


+
 # original C implementation by N. Devillard.
 # This code in public domain.
 # Function :   kth_smallest()
@@ -352,19 +353,20 @@ def get_window_indexer(values, win, minp, index, closed,
                            right_closed, index, floor)

     return indexer.get_data()


+
 # ----------------------------------------------------------------------
 # Rolling count
 # this is only an impl for index not None, IOW, freq aware


-def roll_count(ndarray[double_t] values, int64_t win, int64_t minp,
+def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp,
                object index, object closed):
     cdef:
-        double val, count_x = 0.0
+        float64_t val, count_x = 0.0
         int64_t s, e, nobs, N
         Py_ssize_t i, j
         ndarray[int64_t] start, end
-        ndarray[double_t] output
+        ndarray[float64_t] output

     start, end, N, win, minp, _ = get_window_indexer(values, win,
                                                      minp, index, closed)
@@ -406,12 +408,15 @@ def roll_count(ndarray[double_t] values, int64_t win, int64_t minp,

     return output


+
 # ----------------------------------------------------------------------
 # Rolling sum


-cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil:
-    cdef double result
+cdef inline float64_t calc_sum(int64_t minp, int64_t nobs,
+                               float64_t sum_x) nogil:
+    cdef:
+        float64_t result

     if nobs >= minp:
         result = sum_x
@@ -421,7 +426,7 @@ cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil:
     return result


-cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil:
+cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
     """ add a value from the sum calc """

     # Not NaN
@@ -430,7 +435,8 @@ cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil:
         sum_x[0] = sum_x[0] + val


-cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil:
+cdef inline void remove_sum(float64_t val,
+                            int64_t *nobs, float64_t *sum_x) nogil:
     """ remove a value from the sum calc """

     if notnan(val):
@@ -438,15 +444,15 @@ cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil:
         sum_x[0] = sum_x[0] - val


-def roll_sum(ndarray[double_t] values, int64_t win, int64_t minp,
+def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp,
              object index, object closed):
     cdef:
-        double val, prev_x, sum_x = 0
+        float64_t val, prev_x, sum_x = 0
         int64_t s, e, range_endpoint
         int64_t nobs = 0, i, j, N
         bint is_variable
         ndarray[int64_t] start, end
-        ndarray[double_t] output
+        ndarray[float64_t] output

     start, end, N, win, minp, is_variable = get_window_indexer(values, win,
                                                                minp, index,
@@ -511,16 +517,18 @@ def roll_sum(ndarray[double_t] values, int64_t win, int64_t minp,

     return output
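The `add_sum`/`remove_sum` pair above implements the standard O(n) sliding-window trick: maintain a running total and a count of non-NaN observations as the window slides. A pure-Python sketch of the fixed-window case (simplified: no `is_variable`/`closed` handling):

```python
import numpy as np

def rolling_sum(values, win, minp):
    n = len(values)
    out = np.full(n, np.nan)
    sum_x, nobs = 0.0, 0
    for i in range(n):
        v = values[i]
        if v == v:                 # not NaN: add_sum
            nobs += 1
            sum_x += v
        if i >= win:
            old = values[i - win]
            if old == old:         # not NaN: remove_sum
                nobs -= 1
                sum_x -= old
        if nobs >= minp:           # calc_sum
            out[i] = sum_x
    return out
```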

+
 # ----------------------------------------------------------------------
 # Rolling mean


-cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs,
-                             Py_ssize_t neg_ct, double sum_x) nogil:
-    cdef double result
+cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
+                                Py_ssize_t neg_ct, float64_t sum_x) nogil:
+    cdef:
+        float64_t result

     if nobs >= minp:
-        result = sum_x / <double>nobs
+        result = sum_x / <float64_t>nobs
         if neg_ct == 0 and result < 0:
             # all positive
             result = 0
@@ -534,7 +542,7 @@ cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs,
     return result


-cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x,
+cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
                           Py_ssize_t *neg_ct) nogil:
     """ add a value from the mean calc """

@@ -546,7 +554,7 @@ cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x,
             neg_ct[0] = neg_ct[0] + 1


-cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x,
+cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
                              Py_ssize_t *neg_ct) nogil:
     """ remove a value from the mean calc """

@@ -557,15 +565,15 @@ cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x,
             neg_ct[0] = neg_ct[0] - 1


-def roll_mean(ndarray[double_t] values, int64_t win, int64_t minp,
+def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp,
               object index, object closed):
     cdef:
-        double val, prev_x, result, sum_x = 0
+        float64_t val, prev_x, result, sum_x = 0
         int64_t s, e
         bint is_variable
         Py_ssize_t nobs = 0, i, j, neg_ct = 0, N
         ndarray[int64_t] start, end
-        ndarray[double_t] output
+        ndarray[float64_t] output

     start, end, N, win, minp, is_variable = get_window_indexer(values, win,
                                                                minp, index,
@@ -627,13 +635,15 @@ def roll_mean(ndarray[double_t] values, int64_t win, int64_t minp,

     return output


+
 # ----------------------------------------------------------------------
 # Rolling variance


-cdef inline double calc_var(int64_t minp, int ddof, double nobs,
-                            double ssqdm_x) nogil:
-    cdef double result
+cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
+                               float64_t ssqdm_x) nogil:
+    cdef:
+        float64_t result

     # Variance is unchanged if no observation is added or removed
     if (nobs >= minp) and (nobs > ddof):
@@ -642,7 +652,7 @@ cdef inline double calc_var(int64_t minp, int ddof, double nobs,
         if nobs == 1:
             result = 0
         else:
-            result = ssqdm_x / (nobs - <double>ddof)
+            result = ssqdm_x / (nobs - <float64_t>ddof)
             if result < 0:
                 result = 0
     else:
@@ -651,10 +661,12 @@ cdef inline double calc_var(int64_t minp, int ddof, double nobs,
     return result


-cdef inline void add_var(double val, double *nobs, double *mean_x,
-                         double *ssqdm_x) nogil:
+cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
+                         float64_t *ssqdm_x) nogil:
     """ add a value from the var calc """
-    cdef double delta
+    cdef:
+        float64_t delta
+
     # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug
     if isnan(val):
         return
@@ -667,10 +679,11 @@ cdef inline void add_var(double val, double *nobs, double *mean_x,
     ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0]


-cdef inline void remove_var(double val, double *nobs, double *mean_x,
-                            double *ssqdm_x) nogil:
+cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
+                            float64_t *ssqdm_x) nogil:
     """ remove a value from the var calc """
-    cdef double delta
+    cdef:
+        float64_t delta

     if notnan(val):
         nobs[0] = nobs[0] - 1
@@ -685,18 +698,19 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x,
             ssqdm_x[0] = 0


-def roll_var(ndarray[double_t] values, int64_t win, int64_t minp,
+def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp,
              object index, object closed, int ddof=1):
     """
     Numerically stable implementation using Welford's method.
     """
     cdef:
-        double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old
+        float64_t mean_x = 0, ssqdm_x = 0, nobs = 0,
+        float64_t val, prev, delta, mean_x_old
         int64_t s, e
         bint is_variable
         Py_ssize_t i, j, N
         ndarray[int64_t] start, end
-        ndarray[double_t] output
+        ndarray[float64_t] output

     start, end, N, win, minp, is_variable = get_window_indexer(values, win,
                                                                minp, index,
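For readers unfamiliar with the docstring's reference: `add_var` is one step of Welford's online update, which avoids the catastrophic cancellation of the naive sum-of-squares formula. A pure-Python sketch:

```python
def welford_update(nobs, mean_x, ssqdm_x, val):
    # one add_var step: update count, mean, and sum of squared
    # deviations from the mean
    nobs += 1
    delta = val - mean_x
    mean_x += delta / nobs
    ssqdm_x += (nobs - 1) * delta ** 2 / nobs
    return nobs, mean_x, ssqdm_x

def variance(values, ddof=1):
    nobs = mean_x = ssqdm_x = 0.0
    for v in values:
        nobs, mean_x, ssqdm_x = welford_update(nobs, mean_x, ssqdm_x, v)
    return ssqdm_x / (nobs - ddof)  # calc_var, ignoring the minp guard
```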
""" cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t val, prev, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, @@ -785,13 +799,15 @@ def roll_var(ndarray[double_t] values, int64_t win, int64_t minp, # ---------------------------------------------------------------------- # Rolling skewness -cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, - double xxx) nogil: - cdef double result, dnobs - cdef double A, B, C, R +cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, + float64_t x, float64_t xx, + float64_t xxx) nogil: + cdef: + float64_t result, dnobs + float64_t A, B, C, R if nobs >= minp: - dnobs = nobs + dnobs = nobs A = x / dnobs B = xx / dnobs - A * A C = xxx / dnobs - A * A * A - 3 * A * B @@ -817,8 +833,9 @@ cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, return result -cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, - double *xxx) nogil: +cdef inline void add_skew(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx) nogil: """ add a value from the skew calc """ # Not NaN @@ -831,8 +848,9 @@ cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, xxx[0] = xxx[0] + val * val * val -cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, - double *xxx) nogil: +cdef inline void remove_skew(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx) nogil: """ remove a value from the skew calc """ # Not NaN @@ -845,16 +863,16 @@ cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[double_t] values, int64_t win, int64_t minp, +def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev - double x = 0, xx = 0, xxx = 0 + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N int64_t s, e bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, @@ -915,17 +933,20 @@ def roll_skew(ndarray[double_t] values, int64_t win, int64_t minp, return output + # ---------------------------------------------------------------------- # Rolling kurtosis -cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, - double xxx, double xxxx) nogil: - cdef double result, dnobs - cdef double A, B, C, D, R, K +cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, + float64_t x, float64_t xx, + float64_t xxx, float64_t xxxx) nogil: + cdef: + float64_t result, dnobs + float64_t A, B, C, D, R, K if nobs >= minp: - dnobs = nobs + dnobs = nobs A = x / dnobs R = A * A B = xx / dnobs - R @@ -954,8 +975,9 @@ cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, return result -cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, - double *xxx, double *xxxx) nogil: +cdef inline void add_kurt(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx, float64_t *xxxx) nogil: """ add a value from the kurotic calc """ # Not NaN @@ -969,8 +991,9 @@ cdef inline void add_kurt(double val, int64_t 
*nobs, double *x, double *xx, xxxx[0] = xxxx[0] + val * val * val * val -cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, - double *xxx, double *xxxx) nogil: +cdef inline void remove_kurt(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx, float64_t *xxxx) nogil: """ remove a value from the kurotic calc """ # Not NaN @@ -984,16 +1007,16 @@ cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[double_t] values, int64_t win, int64_t minp, +def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev - double x = 0, xx = 0, xxx = 0, xxxx = 0 + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, N int64_t s, e bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, @@ -1050,6 +1073,7 @@ def roll_kurt(ndarray[double_t] values, int64_t win, int64_t minp, return output + # ---------------------------------------------------------------------- # Rolling median, min, max @@ -1057,7 +1081,7 @@ def roll_kurt(ndarray[double_t] values, int64_t win, int64_t minp, def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, res, prev + float64_t val, res, prev bint err = 0, is_variable int ret = 0 skiplist_t *sl @@ -1065,7 +1089,7 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t nobs = 0, N, s, e int midpoint ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs @@ -1130,6 +1154,7 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, raise MemoryError("skiplist_insert failed") return output + # ---------------------------------------------------------------------- # Moving maximum / minimum code taken from Bottleneck under the terms @@ -1167,7 +1192,8 @@ cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, numeric value) nogil: - cdef numeric result + cdef: + numeric result if numeric in cython.floating: if nobs >= minp: @@ -1252,7 +1278,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, Py_ssize_t nobs = 0 deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute - ndarray[double_t, ndim=1] output + ndarray[float64_t, ndim=1] output output = np.empty(N, dtype=float) Q = deque[int64_t]() @@ -1335,7 +1361,7 @@ cdef _roll_min_max_fixed(ndarray[numeric] values, numeric* minvalue numeric* end numeric* last - ndarray[double_t, ndim=1] output + ndarray[float64_t, ndim=1] output output = np.empty(N, dtype=float) # setup the rings of death! 
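# ----------------------------------------------------------------------
# Illustrative sketch (hypothetical names, not part of the patch) of the
# monotonic-deque technique that _roll_min_max_variable above relies on:
# the deque holds candidate indices, its front is always the index of the
# current window maximum, and each index is pushed and popped at most once,
# so a full pass is O(N). min_periods handling is ignored for brevity.
from collections import deque


def rolling_max_sketch(values, win):
    out = []
    q = deque()  # candidate indices; values[q[0]] is the running max
    for i, val in enumerate(values):
        while q and values[q[-1]] <= val:
            q.pop()          # dominated by the new value; never a max again
        q.append(i)
        if q[0] <= i - win:
            q.popleft()      # front index has slid out of the window
        out.append(values[q[0]] if i >= win - 1 else None)
    return out


# rolling_max_sketch([3, 1, 4, 1, 5, 9, 2], 3) -> [None, None, 4, 4, 5, 9, 9]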
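# ----------------------------------------------------------------------
# Illustrative sketch (hypothetical names, not part of the patch) of
# Welford-style add/remove updates like those roll_var above performs,
# ignoring its NaN and min_periods handling. `ssqdm` is the sum of squared
# deviations from the running mean; maintaining it incrementally keeps the
# sliding-window variance numerically stable at O(1) work per element.
def add_var_sketch(val, nobs, mean, ssqdm):
    nobs += 1
    delta = val - mean
    mean += delta / nobs
    ssqdm += (nobs - 1) * delta ** 2 / nobs
    return nobs, mean, ssqdm


def remove_var_sketch(val, nobs, mean, ssqdm):
    nobs -= 1
    if nobs == 0:
        return 0, 0.0, 0.0
    delta = val - mean
    mean -= delta / nobs
    ssqdm -= (nobs + 1) * delta ** 2 / nobs
    return nobs, mean, ssqdm


# slide a window of 3 over [1, 2, 3, 4]: add 1..4, then evict the 1
state = (0, 0.0, 0.0)
for v in [1.0, 2.0, 3.0, 4.0]:
    state = add_var_sketch(v, *state)
state = remove_var_sketch(1.0, *state)
nobs, mean, ssqdm = state
assert (nobs, mean) == (3, 3.0) and abs(ssqdm / (nobs - 1) - 1.0) < 1e-12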
@@ -1427,19 +1453,19 @@ interpolation_types = { def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, int64_t minp, object index, object closed, - double quantile, str interpolation): + float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list """ cdef: - double val, prev, midpoint, idx_with_fraction + float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist int64_t nobs = 0, i, j, s, e, N Py_ssize_t idx bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output - double vlow, vhigh + ndarray[float64_t] output + float64_t vlow, vhigh InterpolationType interpolation_type int ret = 0 @@ -1529,7 +1555,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, elif interpolation_type == MIDPOINT: vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) - output[i] = (vlow + vhigh) / 2 + output[i] = (vlow + vhigh) / 2 else: output[i] = NaN @@ -1543,7 +1569,7 @@ def roll_generic(object obj, int offset, object func, bint raw, object args, object kwargs): cdef: - ndarray[double_t] output, counts, bufarr + ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf @@ -1642,7 +1668,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] values, Assume len(weights) << len(values) """ cdef: - ndarray[double_t] output, tot_wgt, counts + ndarray[float64_t] output, tot_wgt, counts Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k float64_t val_in, val_win, c, w @@ -1703,7 +1729,8 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] values, # Exponentially weighted moving average -def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): +def ewma(float64_t[:] vals, float64_t com, + int adjust, int ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass. @@ -1722,8 +1749,8 @@ def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): cdef: Py_ssize_t N = len(vals) - ndarray[double_t] output = np.empty(N, dtype=float) - double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + ndarray[float64_t] output = np.empty(N, dtype=float) + float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur Py_ssize_t i, nobs if N == 0: @@ -1767,12 +1794,13 @@ def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): return output + # ---------------------------------------------------------------------- # Exponentially weighted moving covariance -def ewmcov(double_t[:] input_x, double_t[:] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): +def ewmcov(float64_t[:] input_x, float64_t[:] input_y, + float64_t com, int adjust, int ignore_na, int minp, int bias): """ Compute exponentially-weighted moving variance using center-of-mass. 
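# ----------------------------------------------------------------------
# Illustrative sketch (hypothetical name, not part of the patch) of the
# recurrence the ewma() function above implements for adjust=1 with no
# missing values: alpha = 1 / (1 + com), and instead of renormalizing all
# weights at each step, the accumulated old weight is decayed once and the
# new observation enters with weight 1.
import numpy as np


def ewma_sketch(vals, com):
    alpha = 1.0 / (1.0 + com)
    old_wt_factor = 1.0 - alpha
    output = np.empty(len(vals))
    weighted_avg, old_wt = vals[0], 1.0
    output[0] = weighted_avg
    for i in range(1, len(vals)):
        old_wt *= old_wt_factor       # decay everything seen so far
        weighted_avg = (old_wt * weighted_avg + vals[i]) / (old_wt + 1.0)
        old_wt += 1.0                 # adjust: weights accumulate
        output[i] = weighted_avg
    return output


# equivalent to sum((1-alpha)**k * x[t-k]) / sum((1-alpha)**k) at each t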
@@ -1793,10 +1821,10 @@ def ewmcov(double_t[:] input_x, double_t[:] input_y, cdef: Py_ssize_t N = len(input_x) - double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y Py_ssize_t i, nobs - ndarray[double_t] output + ndarray[float64_t] output if len(input_y) != N: raise ValueError("arrays are of different lengths " diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3fa4f503d2dd5..daf2dcccd284b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -124,8 +124,12 @@ def asi8(self): # do not cache or you'll create a memory leak return self._data.view('i8') - # ------------------------------------------------------------------ - # Array-like Methods + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + @property + def nbytes(self): + return self._data.nbytes @property def shape(self): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 405056c628ceb..a6f688fb0cf7a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, is_object_dtype, + is_int64_dtype, is_datetime64tz_dtype, is_datetime64_dtype, ensure_int64) @@ -233,9 +234,7 @@ def __new__(cls, values, freq=None, tz=None, dtype=None): result = cls._simple_new(values, freq=freq, tz=tz) if freq_infer: - inferred = result.inferred_freq - if inferred: - result.freq = to_offset(inferred) + result.freq = to_offset(result.inferred_freq) # NB: Among other things not yet ported from the DatetimeIndex # constructor, this does not call _deepcopy_if_needed @@ -386,7 +385,16 @@ def _resolution(self): return libresolution.resolution(self.asi8, self.tz) # ---------------------------------------------------------------- - # Array-like Methods + # Array-Like / EA-Interface Methods + + def __array__(self, dtype=None): + if is_object_dtype(dtype): + return np.array(list(self), dtype=object) + elif is_int64_dtype(dtype): + return self.asi8 + + # TODO: warn that conversion may be lossy? + return self._data.view(np.ndarray) # follow Index.__array__ def __iter__(self): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6d7ab6e6176d1..1a6c10ee05261 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -272,10 +272,6 @@ def _concat_same_type(cls, to_concat): # -------------------------------------------------------------------- # Data / Attributes - @property - def nbytes(self): - # TODO(DatetimeArray): remove - return self._data.nbytes @cache_readonly def dtype(self): @@ -286,10 +282,6 @@ def _ndarray_values(self): # Ordinals return self._data - @property - def asi8(self): - return self._data - @property def freq(self): """Return the frequency object for this PeriodArray.""" @@ -330,6 +322,58 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') + def to_timestamp(self, freq=None, how='start'): + """ + Cast to DatetimeArray/Index. + + Parameters + ---------- + freq : string or DateOffset, optional + Target frequency. 
The default is 'D' for week or longer, + 'S' otherwise + how : {'s', 'e', 'start', 'end'} + + Returns + ------- + DatetimeArray/Index + """ + from pandas.core.arrays import DatetimeArrayMixin + + how = libperiod._validate_end_alias(how) + + end = how == 'E' + if end: + if freq == 'B': + # roll forward to ensure we land on B date + adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') + return self.to_timestamp(how='start') + adjust + else: + adjust = Timedelta(1, 'ns') + return (self + self.freq).to_timestamp(how='start') - adjust + + if freq is None: + base, mult = frequencies.get_freq_code(self.freq) + freq = frequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) + + base, mult = frequencies.get_freq_code(freq) + new_data = self.asfreq(freq, how=how) + + new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) + return DatetimeArrayMixin(new_data, freq='infer') + + # -------------------------------------------------------------------- + # Array-like / EA-Interface Methods + + def __repr__(self): + return '<{}>\n{}\nLength: {}, dtype: {}'.format( + self.__class__.__name__, + [str(s) for s in self], + len(self), + self.dtype + ) + def _formatter(self, formatter=None): if formatter: return formatter.formatter or str @@ -453,6 +497,8 @@ def value_counts(self, dropna=False): name=result.index.name) return Series(result.values, index=index, name=result.name) + # -------------------------------------------------------------------- + def shift(self, periods=1): """ Shift values by desired number. @@ -564,49 +610,9 @@ def asfreq(self, freq=None, how='E'): return type(self)(new_data, freq=freq) - def to_timestamp(self, freq=None, how='start'): - """ - Cast to DatetimeArray/Index - - Parameters - ---------- - freq : string or DateOffset, optional - Target frequency. The default is 'D' for week or longer, - 'S' otherwise - how : {'s', 'e', 'start', 'end'} - - Returns - ------- - DatetimeArray/Index - """ - from pandas.core.arrays import DatetimeArrayMixin - - how = libperiod._validate_end_alias(how) - - end = how == 'E' - if end: - if freq == 'B': - # roll forward to ensure we land on B date - adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') - return self.to_timestamp(how='start') + adjust - else: - adjust = Timedelta(1, 'ns') - return (self + self.freq).to_timestamp(how='start') - adjust - - if freq is None: - base, mult = frequencies.get_freq_code(self.freq) - freq = frequencies.get_to_timestamp_base(base) - else: - freq = Period._maybe_convert_freq(freq) - - base, mult = frequencies.get_freq_code(freq) - new_data = self.asfreq(freq, how=how) - - new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) - return DatetimeArrayMixin(new_data, freq='infer') - # ------------------------------------------------------------------ # Formatting + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): """ actually format my specific types """ # TODO(DatetimeArray): remove @@ -627,9 +633,13 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): values = np.array([formatter(dt) for dt in values]) return values + # Delegation... + def strftime(self, date_format): + return self._format_native_types(date_format=date_format) + def repeat(self, repeats, *args, **kwargs): """ - Repeat elements of a Categorical. + Repeat elements of a PeriodArray. See also -------- @@ -640,10 +650,6 @@ def repeat(self, repeats, *args, **kwargs): values = self._data.repeat(repeats) return type(self)(values, self.freq) - # Delegation... 
- def strftime(self, date_format): - return self._format_native_types(date_format=date_format) - def astype(self, dtype, copy=True): # TODO: Figure out something better here... # We have DatetimeLikeArrayMixin -> diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index cf3ba263d1f81..9dbdd6ff8b562 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,18 +1,28 @@ # -*- coding: utf-8 -*- from datetime import timedelta +import warnings import numpy as np from pandas._libs import tslibs -from pandas._libs.tslibs import Timedelta, Timestamp, NaT +from pandas._libs.tslibs import Timedelta, Timestamp, NaT, iNaT from pandas._libs.tslibs.fields import get_timedelta_field -from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, parse_timedelta_unit) from pandas import compat from pandas.core.dtypes.common import ( - _TD_DTYPE, is_list_like) -from pandas.core.dtypes.generic import ABCSeries + _TD_DTYPE, + is_object_dtype, + is_string_dtype, + is_float_dtype, + is_integer_dtype, + is_timedelta64_dtype, + is_datetime64_dtype, + is_list_like, + ensure_int64) +from pandas.core.dtypes.generic import ABCSeries, ABCTimedeltaIndex from pandas.core.dtypes.missing import isna import pandas.core.common as com @@ -139,9 +149,7 @@ def __new__(cls, values, freq=None): result = cls._simple_new(values, freq=freq) if freq_infer: - inferred = result.inferred_freq - if inferred: - result.freq = to_offset(inferred) + result.freq = to_offset(result.inferred_freq) return result @@ -182,6 +190,9 @@ def _generate_range(cls, start, end, periods, freq, closed=None): return cls._simple_new(index, freq=freq) + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + # ---------------------------------------------------------------- # Arithmetic Methods @@ -397,6 +408,172 @@ def f(x): # --------------------------------------------------------------------- # Constructor Helpers +def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): + """ + Parameters + ---------- + data : list-like + copy : bool, default False + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + converted : numpy.ndarray + The sequence converted to a numpy array with dtype ``timedelta64[ns]``. + inferred_freq : Tick or None + The inferred frequency of the sequence. + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + inferred_freq = None + unit = parse_timedelta_unit(unit) + + # Unwrap whatever we have into a np.ndarray + if not hasattr(data, 'dtype'): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. 
generator + data = list(data) + data = np.array(data, copy=False) + elif isinstance(data, ABCSeries): + data = data._values + elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)): + inferred_freq = data.freq + data = data._data + + # Convert whatever we have into timedelta64[ns] dtype + if is_object_dtype(data) or is_string_dtype(data): + # no need to make a copy, need to convert if string-dtyped + data = objects_to_td64ns(data, unit=unit, errors=errors) + copy = False + + elif is_integer_dtype(data): + # treat as multiples of the given unit + data, copy_made = ints_to_td64ns(data, unit=unit) + copy = copy and not copy_made + + elif is_float_dtype(data): + # treat as multiples of the given unit. If after converting to nanos, + # there are fractional components left, these are truncated + # (i.e. NOT rounded) + mask = np.isnan(data) + coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns') + data = (coeff * data).astype(np.int64).view('timedelta64[ns]') + data[mask] = iNaT + copy = False + + elif is_timedelta64_dtype(data): + if data.dtype != _TD_DTYPE: + # non-nano unit + # TODO: watch out for overflows + data = data.astype(_TD_DTYPE) + copy = False + + elif is_datetime64_dtype(data): + # GH#23539 + warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " + "deprecated, will raise a TypeError in a future " + "version", + FutureWarning, stacklevel=3) + data = ensure_int64(data).view(_TD_DTYPE) + + else: + raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" + .format(dtype=data.dtype)) + + data = np.array(data, copy=copy) + assert data.dtype == 'm8[ns]', data + return data, inferred_freq + + +def ints_to_td64ns(data, unit="ns"): + """ + Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating + the integers as multiples of the given timedelta unit. + + Parameters + ---------- + data : numpy.ndarray with integer-dtype + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + bool : whether a copy was made + """ + copy_made = False + unit = unit if unit is not None else "ns" + + if data.dtype != np.int64: + # converting to int64 makes a copy, so we can avoid + # re-copying later + data = data.astype(np.int64) + copy_made = True + + if unit != "ns": + dtype_str = "timedelta64[{unit}]".format(unit=unit) + data = data.view(dtype_str) + + # TODO: watch out for overflows when converting from lower-resolution + data = data.astype("timedelta64[ns]") + # the astype conversion makes a copy, so we can avoid re-copying later + copy_made = True + + else: + data = data.view("timedelta64[ns]") + + return data, copy_made + + +def objects_to_td64ns(data, unit="ns", errors="raise"): + """ + Convert an object-dtyped or string-dtyped array into a + timedelta64[ns]-dtyped array. + + Parameters + ---------- + data : ndarray or Index + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, setting `errors=ignore` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level.
+ """ + # coerce Index to np.ndarray, converting string-dtype if necessary + values = np.array(data, dtype=np.object_, copy=False) + + result = array_to_timedelta64(values, + unit=unit, errors=errors) + return result.view('timedelta64[ns]') + + def _generate_regular_range(start, end, periods, offset): stride = offset.nanos if periods is None: diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b2999c112e8ab..bb4ab823069ee 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -101,27 +101,6 @@ def _get_frame_result_type(result, objs): ABCSparseDataFrame)) -def _get_sliced_frame_result_type(data, obj): - """ - return appropriate class of Series. When data is sparse - it will return a SparseSeries, otherwise it will return - the Series. - - Parameters - ---------- - data : array-like - obj : DataFrame - - Returns - ------- - Series or SparseSeries - """ - if is_sparse(data): - from pandas.core.sparse.api import SparseSeries - return SparseSeries - return obj._constructor_sliced - - def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index d56bd83f01236..5f35a040d7d47 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -73,7 +73,7 @@ def is_string_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Examples -------- @@ -127,7 +127,7 @@ def is_iterator(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -172,7 +172,7 @@ def is_file_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -203,7 +203,7 @@ def is_re(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -227,7 +227,7 @@ def is_re_compilable(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -261,7 +261,7 @@ def is_list_like(obj, allow_sets=True): Parameters ---------- - obj : The object to check. + obj : The object to check allow_sets : boolean, default True If this parameter is False, sets will not be considered list-like @@ -310,7 +310,7 @@ def is_array_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -343,7 +343,7 @@ def is_nested_list_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -384,7 +384,7 @@ def is_dict_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -408,7 +408,7 @@ def is_named_tuple(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -468,7 +468,7 @@ def is_sequence(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b24f79e89902a..f8d153327f135 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -72,7 +72,6 @@ is_iterator, is_sequence, is_named_tuple) -from pandas.core.dtypes.concat import _get_sliced_frame_result_type from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex from pandas.core.dtypes.missing import isna, notna @@ -864,12 +863,17 @@ def iterrows(self): data types, the iterator returns a copy and not a view, and writing to it will have no effect. 
- Returns - ------- + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + it : generator A generator that iterates over the rows of the frame. - See also + See Also -------- itertuples : Iterate over DataFrame rows as namedtuples of the values. iteritems : Iterate over (column name, Series) pairs. @@ -3236,7 +3240,7 @@ def _box_item_values(self, key, values): def _box_col_values(self, values, items): """ provide boxed values for a column """ - klass = _get_sliced_frame_result_type(values, self) + klass = self._constructor_sliced return klass(values, index=self.index, name=items, fastpath=True) def __setitem__(self, key, value): @@ -3951,6 +3955,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False, necessary. Setting to False will improve the performance of this method + Returns + ------- + DataFrame + Examples -------- >>> df = pd.DataFrame({'month': [1, 4, 7, 10], @@ -3991,10 +3999,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2 2014 4 40 3 2013 7 84 4 2014 10 31 - - Returns - ------- - dataframe : DataFrame """ inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(keys, list): @@ -5514,50 +5518,72 @@ def pivot(self, index=None, columns=None, values=None): ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], - ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two NaN 6 + + We can also fill missing values using the `fill_value` parameter. >>> table = pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) + ... columns=['C'], aggfunc=np.sum, fill_value=0) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + mean mean + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 
'E': [min, max, np.mean]}) >>> table D E - mean max median min + mean max mean min A C - bar large 5.500000 16 14.5 13 - small 5.500000 15 14.5 14 - foo large 2.000000 10 9.5 9 - small 2.333333 12 11.0 8 + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 Returns ------- @@ -6694,6 +6720,15 @@ def round(self, decimals=0, *args, **kwargs): of `decimals` which are not columns of the input will be ignored. + Returns + ------- + DataFrame + + See Also + -------- + numpy.around + Series.round + Examples -------- >>> df = pd.DataFrame(np.random.random([3, 3]), @@ -6719,15 +6754,6 @@ def round(self, decimals=0, *args, **kwargs): first 0.0 1 0.17 second 0.0 1 0.58 third 0.9 0 0.49 - - Returns - ------- - DataFrame object - - See Also - -------- - numpy.around - Series.round """ from pandas.core.reshape.concat import concat @@ -6793,7 +6819,6 @@ def corr(self, method='pearson', min_periods=1): Examples -------- - >>> import numpy as np >>> histogram_intersection = lambda a, b: np.minimum(a, b ... ).sum().round(decimals=1) >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cfdc6b34274bf..34f25c5634d5b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5248,11 +5248,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): the same type. Alternatively, use {col: dtype, ...}, where col is a column label and dtype is a numpy.dtype or Python type to cast one or more of the DataFrame's columns to column-specific types. - copy : bool, default True. + copy : bool, default True Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). - errors : {'raise', 'ignore'}, default 'raise'. + errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. - ``raise`` : allow exceptions to be raised @@ -9691,8 +9691,7 @@ def nanptp(values, axis=0, skipna=True): cls.ptp = _make_stat_function( cls, 'ptp', name, name2, axis_descr, - """ - Returns the difference between the maximum value and the + """Returns the difference between the maximum value and the minimum value in the object. This is the equivalent of the ``numpy.ndarray`` method ``ptp``. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8470bc6fec490..263de57d32f31 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -301,11 +301,19 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, (dtype is not None and is_datetime64_any_dtype(dtype)) or 'tz' in kwargs): from pandas import DatetimeIndex - result = DatetimeIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + if dtype is not None and is_dtype_equal(_o_dtype, dtype): - return Index(result.to_pydatetime(), dtype=_o_dtype) + # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, + # will raise in the where `data` is already tz-aware. So + # we leave it out of this step and cast to object-dtype after + # the DatetimeIndex construction. 
+ # Note we can pass copy=False because the .astype below + # will always make a copy + result = DatetimeIndex(data, copy=False, name=name, **kwargs) + return result.astype(object) else: + result = DatetimeIndex(data, copy=copy, name=name, + dtype=dtype, **kwargs) return result elif (is_timedelta64_dtype(data) or diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 59429488a7c2f..1d9d3b1d3bd16 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -4,44 +4,32 @@ """ import warnings -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.core.tools.timedeltas import to_timedelta - import numpy as np -from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs.timestamps import round_nsint64, RoundTo +from pandas._libs import NaT, iNaT, lib +from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - ensure_int64, - is_dtype_equal, - is_float, - is_integer, - is_list_like, - is_scalar, - is_bool_dtype, - is_period_dtype, - is_categorical_dtype, - is_datetime_or_timedelta_dtype, - is_float_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype) -from pandas.core.dtypes.generic import ( - ABCIndex, ABCSeries, ABCIndexClass) + ensure_int64, is_bool_dtype, is_categorical_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_period_dtype, is_scalar, is_string_dtype) +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core import common as com, algorithms, ops - -import pandas.io.formats.printing as printing +from pandas.core import algorithms, common as com, ops from pandas.core.arrays import PeriodArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat +from pandas.core.tools.timedeltas import to_timedelta + +import pandas.io.formats.printing as printing -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 56ab9b6c020c0..b754b2705d034 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,57 +1,45 @@ # pylint: disable=E1101 from __future__ import division + +from datetime import datetime, time, timedelta import operator import warnings -from datetime import time, datetime, timedelta import numpy as np from pytz import utc -from pandas.core.base import _shared_docs +from pandas._libs import ( + Timestamp, index as libindex, join as libjoin, lib, tslib as libts) +from pandas._libs.tslibs import ( + ccalendar, conversion, fields, parsing, timezones) +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - _INT64_DTYPE, - _NS_DTYPE, - is_datetime64_dtype, - is_datetimetz, - is_dtype_equal, - is_integer, - is_float, - is_integer_dtype, - is_datetime64_ns_dtype, - is_period_dtype, - is_string_like, 
- is_list_like, - is_scalar, - pandas_dtype, - ensure_int64) + _INT64_DTYPE, _NS_DTYPE, ensure_int64, is_datetime64_dtype, + is_datetime64_ns_dtype, is_datetimetz, is_dtype_equal, is_float, + is_integer, is_integer_dtype, is_list_like, is_period_dtype, is_scalar, + is_string_like, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna -import pandas.core.dtypes.concat as _concat -from pandas.core.arrays.datetimes import DatetimeArrayMixin, _to_m8 from pandas.core.arrays import datetimelike as dtl - +from pandas.core.arrays.datetimes import ( + DatetimeArrayMixin as DatetimeArray, _to_m8) +from pandas.core.base import _shared_docs +import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.datetimelike import ( + DatelikeOps, DatetimeIndexOpsMixin, TimelikeOps, wrap_array_method, + wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name -import pandas.compat as compat -from pandas.tseries.frequencies import to_offset, Resolution -from pandas.core.indexes.datetimelike import ( - DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, - wrap_field_accessor, wrap_array_method) -from pandas.tseries.offsets import ( - CDay, prefix_mapping) - -from pandas.util._decorators import Appender, cache_readonly, Substitution -import pandas.core.common as com -import pandas.tseries.offsets as offsets import pandas.core.tools.datetimes as tools -from pandas._libs import (lib, index as libindex, tslib as libts, - join as libjoin, Timestamp) -from pandas._libs.tslibs import (timezones, conversion, fields, parsing, - ccalendar) +from pandas.tseries import offsets +from pandas.tseries.frequencies import Resolution, to_offset +from pandas.tseries.offsets import CDay, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -68,7 +56,7 @@ def _new_DatetimeIndex(cls, d): return result -class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, +class DatetimeIndex(DatetimeArray, DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and @@ -182,8 +170,6 @@ class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, pandas.to_datetime : Convert argument to datetime """ - _resolution = cache_readonly(DatetimeArrayMixin._resolution.fget) - _typ = 'datetimeindex' _join_precedence = 10 @@ -227,8 +213,6 @@ def _join_i8_wrapper(joinf, **kwargs): _is_numeric_dtype = False _infer_as_myclass = True - _timezone = cache_readonly(DatetimeArrayMixin._timezone.fget) - is_normalized = cache_readonly(DatetimeArrayMixin.is_normalized.fget) # -------------------------------------------------------------------- # Constructors @@ -239,6 +223,21 @@ def __new__(cls, data=None, dayfirst=False, yearfirst=False, dtype=None, copy=False, name=None, verify_integrity=True): + if data is None: + # TODO: Remove this block and associated kwargs; GH#20535 + result = cls._generate_range(start, end, periods, + freq=freq, tz=tz, normalize=normalize, + closed=closed, ambiguous=ambiguous) + result.name = name + return result + + if is_scalar(data): + raise TypeError("{cls}() must be called with a " + "collection of some kind, {data} was passed" + .format(cls=cls.__name__, data=repr(data))) + + # - Cases checked above all return/raise before reaching here - # + # This allows to later ensure that the 'copy' parameter is honored: if isinstance(data, 
Index): ref_to_data = data._data @@ -253,20 +252,7 @@ def __new__(cls, data=None, # if dtype has an embedded tz, capture it tz = dtl.validate_tz_from_dtype(dtype, tz) - if data is None: - # TODO: Remove this block and associated kwargs; GH#20535 - result = cls._generate_range(start, end, periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) - result.name = name - return result - - if not isinstance(data, (np.ndarray, Index, ABCSeries, - DatetimeArrayMixin)): - if is_scalar(data): - raise ValueError('DatetimeIndex() must be called with a ' - 'collection of some kind, %s was passed' - % repr(data)) + if not isinstance(data, (np.ndarray, Index, ABCSeries, DatetimeArray)): # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) @@ -280,7 +266,7 @@ def __new__(cls, data=None, data = tools.to_datetime(data, dayfirst=dayfirst, yearfirst=yearfirst) - if isinstance(data, DatetimeArrayMixin): + if isinstance(data, DatetimeArray): if tz is None: tz = data.tz elif data.tz is None: @@ -328,9 +314,7 @@ def __new__(cls, data=None, cls._validate_frequency(subarr, freq, ambiguous=ambiguous) if freq_infer: - inferred = subarr.inferred_freq - if inferred: - subarr.freq = to_offset(inferred) + subarr.freq = to_offset(subarr.inferred_freq) return subarr._deepcopy_if_needed(ref_to_data, copy) @@ -501,7 +485,7 @@ def to_series(self, keep_tz=False, index=None, name=None): Parameters ---------- - keep_tz : optional, defaults False. + keep_tz : optional, defaults False return the data keeping the timezone. If keep_tz is True: @@ -1124,43 +1108,47 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray - year = wrap_field_accessor(DatetimeArrayMixin.year) - month = wrap_field_accessor(DatetimeArrayMixin.month) - day = wrap_field_accessor(DatetimeArrayMixin.day) - hour = wrap_field_accessor(DatetimeArrayMixin.hour) - minute = wrap_field_accessor(DatetimeArrayMixin.minute) - second = wrap_field_accessor(DatetimeArrayMixin.second) - microsecond = wrap_field_accessor(DatetimeArrayMixin.microsecond) - nanosecond = wrap_field_accessor(DatetimeArrayMixin.nanosecond) - weekofyear = wrap_field_accessor(DatetimeArrayMixin.weekofyear) + _timezone = cache_readonly(DatetimeArray._timezone.fget) + is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) + _resolution = cache_readonly(DatetimeArray._resolution.fget) + + year = wrap_field_accessor(DatetimeArray.year) + month = wrap_field_accessor(DatetimeArray.month) + day = wrap_field_accessor(DatetimeArray.day) + hour = wrap_field_accessor(DatetimeArray.hour) + minute = wrap_field_accessor(DatetimeArray.minute) + second = wrap_field_accessor(DatetimeArray.second) + microsecond = wrap_field_accessor(DatetimeArray.microsecond) + nanosecond = wrap_field_accessor(DatetimeArray.nanosecond) + weekofyear = wrap_field_accessor(DatetimeArray.weekofyear) week = weekofyear - dayofweek = wrap_field_accessor(DatetimeArrayMixin.dayofweek) + dayofweek = wrap_field_accessor(DatetimeArray.dayofweek) weekday = dayofweek - weekday_name = wrap_field_accessor(DatetimeArrayMixin.weekday_name) + weekday_name = wrap_field_accessor(DatetimeArray.weekday_name) - dayofyear = wrap_field_accessor(DatetimeArrayMixin.dayofyear) - quarter = wrap_field_accessor(DatetimeArrayMixin.quarter) - days_in_month = wrap_field_accessor(DatetimeArrayMixin.days_in_month) + dayofyear = wrap_field_accessor(DatetimeArray.dayofyear) + quarter = 
wrap_field_accessor(DatetimeArray.quarter) + days_in_month = wrap_field_accessor(DatetimeArray.days_in_month) daysinmonth = days_in_month - is_month_start = wrap_field_accessor(DatetimeArrayMixin.is_month_start) - is_month_end = wrap_field_accessor(DatetimeArrayMixin.is_month_end) - is_quarter_start = wrap_field_accessor(DatetimeArrayMixin.is_quarter_start) - is_quarter_end = wrap_field_accessor(DatetimeArrayMixin.is_quarter_end) - is_year_start = wrap_field_accessor(DatetimeArrayMixin.is_year_start) - is_year_end = wrap_field_accessor(DatetimeArrayMixin.is_year_end) - is_leap_year = wrap_field_accessor(DatetimeArrayMixin.is_leap_year) - - tz_localize = wrap_array_method(DatetimeArrayMixin.tz_localize, True) - tz_convert = wrap_array_method(DatetimeArrayMixin.tz_convert, True) - to_perioddelta = wrap_array_method(DatetimeArrayMixin.to_perioddelta, + is_month_start = wrap_field_accessor(DatetimeArray.is_month_start) + is_month_end = wrap_field_accessor(DatetimeArray.is_month_end) + is_quarter_start = wrap_field_accessor(DatetimeArray.is_quarter_start) + is_quarter_end = wrap_field_accessor(DatetimeArray.is_quarter_end) + is_year_start = wrap_field_accessor(DatetimeArray.is_year_start) + is_year_end = wrap_field_accessor(DatetimeArray.is_year_end) + is_leap_year = wrap_field_accessor(DatetimeArray.is_leap_year) + + tz_localize = wrap_array_method(DatetimeArray.tz_localize, True) + tz_convert = wrap_array_method(DatetimeArray.tz_convert, True) + to_perioddelta = wrap_array_method(DatetimeArray.to_perioddelta, False) - to_period = wrap_array_method(DatetimeArrayMixin.to_period, True) - normalize = wrap_array_method(DatetimeArrayMixin.normalize, True) - to_julian_date = wrap_array_method(DatetimeArrayMixin.to_julian_date, + to_period = wrap_array_method(DatetimeArray.to_period, True) + normalize = wrap_array_method(DatetimeArray.normalize, True) + to_julian_date = wrap_array_method(DatetimeArray.to_julian_date, False) - month_name = wrap_array_method(DatetimeArrayMixin.month_name, True) - day_name = wrap_array_method(DatetimeArrayMixin.day_name, True) + month_name = wrap_array_method(DatetimeArray.month_name, True) + day_name = wrap_array_method(DatetimeArray.day_name, True) # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9c981c24190a4..01304cce507f0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1885,7 +1885,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): ascending : boolean, default True False to sort in descending order Can also be a list to specify a directed ordering - sort_remaining : sort by the remaining levels after level. 
+ sort_remaining : sort by the remaining levels after level Returns ------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 128068959ebd3..7890f03a1eba7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -256,8 +256,12 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._reset_identity() return result + # ------------------------------------------------------------------------ + # Wrapping PeriodArray + # ------------------------------------------------------------------------ # Data + @property def _ndarray_values(self): return self._data._ndarray_values @@ -361,13 +365,6 @@ def asfreq(self, freq=None, how='E'): result = self._data.asfreq(freq=freq, how=how) return self._simple_new(result, name=self.name) - def _nat_new(self, box=True): - # TODO(DatetimeArray): remove this - result = self._data._nat_new(box=box) - if box: - result = self._simple_new(result, name=self.name) - return result - def to_timestamp(self, freq=None, how='start'): from pandas import DatetimeIndex result = self._data.to_timestamp(freq=freq, how=how) @@ -425,6 +422,7 @@ def _maybe_convert_timedelta(self, other): # ------------------------------------------------------------------------ # Indexing + @cache_readonly def _engine(self): return self._engine_type(lambda: self, len(self)) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d1b5645928921..e4c177a08462e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -38,7 +38,7 @@ class RangeIndex(Int64Index): Parameters ---------- - start : int (default: 0), or other RangeIndex instance. + start : int (default: 0), or other RangeIndex instance If int and "stop" is not given, interpreted as "stop" instead. 
stop : int (default: 0) step : int (default: 1) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5b077a6984114..d9625d38b85de 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,44 +2,36 @@ from datetime import datetime import numpy as np + +from pandas._libs import ( + NaT, Timedelta, index as libindex, join as libjoin, lib) +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution + from pandas.core.dtypes.common import ( - _TD_DTYPE, - is_integer, - is_float, - is_list_like, - is_scalar, - is_timedelta64_dtype, - is_timedelta64_ns_dtype, - pandas_dtype, - ensure_int64) + _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, + is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna -from pandas.core.arrays.timedeltas import ( - TimedeltaArrayMixin, _is_convertible_to_td, _to_m8) from pandas.core.arrays import datetimelike as dtl - -from pandas.core.indexes.base import Index -from pandas.core.indexes.numeric import Int64Index -import pandas.compat as compat - -from pandas.tseries.frequencies import to_offset +from pandas.core.arrays.timedeltas import ( + TimedeltaArrayMixin as TimedeltaArray, _is_convertible_to_td, _to_m8, + sequence_to_td64ns) from pandas.core.base import _shared_docs -from pandas.core.indexes.base import _index_shared_docs import pandas.core.common as com -from pandas.core.ops import get_op_result_name -import pandas.core.dtypes.concat as _concat -from pandas.util._decorators import Appender, Substitution +from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - TimelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op, - wrap_array_method, wrap_field_accessor) -from pandas.core.tools.timedeltas import ( - to_timedelta, _coerce_scalar_to_timedelta_type) -from pandas._libs import (lib, index as libindex, - join as libjoin, Timedelta, NaT) -from pandas._libs.tslibs.timedeltas import array_to_timedelta64 + DatetimeIndexOpsMixin, TimelikeOps, wrap_arithmetic_op, wrap_array_method, + wrap_field_accessor) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name +from pandas.core.tools.timedeltas import _coerce_scalar_to_timedelta_type + +from pandas.tseries.frequencies import to_offset -class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, +class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, TimelikeOps, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -139,12 +131,6 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, periods=None, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): - if isinstance(data, TimedeltaIndex) and freq is None and name is None: - if copy: - return data.copy() - else: - return data._shallow_copy() - freq, freq_infer = dtl.maybe_infer_freq(freq) if data is None: @@ -154,32 +140,31 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, result.name = name return result - if unit is not None: - data = to_timedelta(data, unit=unit, box=False) - if is_scalar(data): - raise ValueError('TimedeltaIndex() must be called with a ' - 'collection of some kind, {data} was passed' - .format(data=repr(data))) - - # convert if not already - if getattr(data, 'dtype', None) != _TD_DTYPE: - data = to_timedelta(data, unit=unit, 
box=False) - elif copy: - data = np.array(data, copy=True) - - data = np.array(data, copy=False) - if data.dtype == np.object_: - data = array_to_timedelta64(data) - if data.dtype != _TD_DTYPE: - if is_timedelta64_dtype(data): - # non-nano unit - # TODO: watch out for overflows - data = data.astype(_TD_DTYPE) + raise TypeError('{cls}() must be called with a ' + 'collection of some kind, {data} was passed' + .format(cls=cls.__name__, data=repr(data))) + + if isinstance(data, TimedeltaIndex) and freq is None and name is None: + if copy: + return data.copy() else: - data = ensure_int64(data).view(_TD_DTYPE) + return data._shallow_copy() - assert data.dtype == 'm8[ns]', data.dtype + # - Cases checked above all return/raise before reaching here - # + + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) + if inferred_freq is not None: + if freq is not None and freq != inferred_freq: + raise ValueError('Inferred frequency {inferred} from passed ' + 'values does not conform to passed frequency ' + '{passed}' + .format(inferred=inferred_freq, + passed=freq.freqstr)) + elif freq_infer: + freq = inferred_freq + freq_infer = False + verify_integrity = False subarr = cls._simple_new(data, name=name, freq=freq) # check that we are matching freqs @@ -188,9 +173,7 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, cls._validate_frequency(subarr, freq) if freq_infer: - inferred = subarr.inferred_freq - if inferred: - subarr.freq = to_offset(inferred) + subarr.freq = to_offset(subarr.inferred_freq) return subarr @@ -232,8 +215,7 @@ def _maybe_update_attributes(self, attrs): return attrs def _evaluate_with_timedelta_like(self, other, op): - result = TimedeltaArrayMixin._evaluate_with_timedelta_like(self, other, - op) + result = TimedeltaArray._evaluate_with_timedelta_like(self, other, op) return wrap_arithmetic_op(self, other, result) def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): @@ -245,12 +227,12 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray - days = wrap_field_accessor(TimedeltaArrayMixin.days) - seconds = wrap_field_accessor(TimedeltaArrayMixin.seconds) - microseconds = wrap_field_accessor(TimedeltaArrayMixin.microseconds) - nanoseconds = wrap_field_accessor(TimedeltaArrayMixin.nanoseconds) + days = wrap_field_accessor(TimedeltaArray.days) + seconds = wrap_field_accessor(TimedeltaArray.seconds) + microseconds = wrap_field_accessor(TimedeltaArray.microseconds) + nanoseconds = wrap_field_accessor(TimedeltaArray.nanoseconds) - total_seconds = wrap_array_method(TimedeltaArrayMixin.total_seconds, True) + total_seconds = wrap_array_method(TimedeltaArray.total_seconds, True) # ------------------------------------------------------------------- diff --git a/pandas/core/panel.py b/pandas/core/panel.py index eb841e6398976..c878d16fac2e9 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -106,14 +106,14 @@ def panel_index(time, panels, names=None): class Panel(NDFrame): """ - Represents wide format panel data, stored as 3-dimensional array - - .. deprecated:: 0.20.0 - The recommended way to represent 3-D data are with a MultiIndex on a - DataFrame via the :attr:`~Panel.to_frame()` method or with the - `xarray package `__. - Pandas provides a :attr:`~Panel.to_xarray()` method to automate this - conversion. + Represents wide format panel data, stored as 3-dimensional array. + + .. 
deprecated:: 0.20.0 + The recommended way to represent 3-D data is with a MultiIndex on a + DataFrame via the :attr:`~Panel.to_frame()` method or with the + `xarray package `__. + Pandas provides a :attr:`~Panel.to_xarray()` method to automate this + conversion. Parameters ---------- diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ec4cdffc56435..d12dbb81765d8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -407,12 +407,12 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. - aggfunc : function, optional - If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed + aggfunc : function, optional + If specified, requires `values` be specified as well margins : boolean, default False Add row/column margins (subtotals) margins_name : string, default 'All' diff --git a/pandas/core/series.py b/pandas/core/series.py index 6971b0b0c78e0..20e4720a3bde7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1913,7 +1913,6 @@ def corr(self, other, method='pearson', min_periods=None): Examples -------- - >>> import numpy as np >>> histogram_intersection = lambda a, b: np.minimum(a, b ... ).sum().round(decimals=1) >>> s1 = pd.Series([.2, .0, .6, .2]) @@ -3025,7 +3024,7 @@ def reorder_levels(self, order): Parameters ---------- - order : list of int representing new level order. + order : list of int representing new level order (reference level by number or key) Returns diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bf0c93437f4dc..a12605aaed554 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -659,7 +659,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): If True, case sensitive flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - na : default NaN, fill value for missing values. + na : default NaN, fill value for missing values Returns ------- @@ -2665,7 +2665,7 @@ def encode(self, encoding, errors="strict"): Parameters ---------- - to_strip : str or None, default None. + to_strip : str or None, default None Specifying the set of characters to be removed. All combinations of this set of characters will be stripped. If None then whitespaces are removed.
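# ----------------------------------------------------------------------
# Illustrative sketch (hypothetical names, not part of the patch) of the
# unit-conversion rules that sequence_to_td64ns, added in
# pandas/core/arrays/timedeltas.py above, centralizes: integer input is
# treated as multiples of `unit`, while float input is scaled to
# nanoseconds and any fractional remainder is truncated, not rounded.
import numpy as np


def ints_to_td64ns_sketch(data, unit="s"):
    # reinterpret as the requested unit, then upcast to nanoseconds
    as_unit = data.astype(np.int64).view("timedelta64[{}]".format(unit))
    return as_unit.astype("timedelta64[ns]")


def floats_to_td64ns_sketch(data, unit="s"):
    # scale to nanoseconds and truncate (NOT round) the fraction
    coeff = np.timedelta64(1, unit) / np.timedelta64(1, "ns")
    return (coeff * data).astype(np.int64).view("timedelta64[ns]")


print(ints_to_td64ns_sketch(np.array([1, 2])))      # 1s and 2s, in ns
print(floats_to_td64ns_sketch(np.array([1.5e-9])))  # 1ns: 1.5ns truncated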
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 220b14a9cb7c6..fad136b3b5a45 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -6,16 +6,13 @@ import pandas as pd from pandas._libs import tslibs from pandas._libs.tslibs.timedeltas import (convert_to_timedelta64, - array_to_timedelta64, parse_timedelta_unit) -from pandas.core.dtypes.common import ( - ensure_object, - is_integer_dtype, - is_timedelta64_dtype, - is_list_like) +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass +from pandas.core.arrays.timedeltas import sequence_to_td64ns + def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ @@ -129,31 +126,27 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): - arg = np.array(list(arg), dtype='O') - - # these are shortcut-able - if is_timedelta64_dtype(arg): - value = arg.astype('timedelta64[ns]') - elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{unit}]'.format(unit=unit)).astype( - 'timedelta64[ns]', copy=False) - else: - try: - value = array_to_timedelta64(ensure_object(arg), - unit=unit, errors=errors) - value = value.astype('timedelta64[ns]', copy=False) - except ValueError: - if errors == 'ignore': - return arg - else: - # This else-block accounts for the cases when errors='raise' - # and errors='coerce'. If errors == 'raise', these errors - # should be raised. If errors == 'coerce', we shouldn't - # expect any errors to be raised, since all parsing errors - # cause coercion to pd.NaT. However, if an error / bug is - # introduced that causes an Exception to be raised, we would - # like to surface it. - raise + # This is needed only to ensure that in the case where we end up + # returning arg (errors == "ignore"), and where the input is a + # generator, we return a useful list-like instead of a + # used-up generator + arg = np.array(list(arg), dtype=object) + + try: + value = sequence_to_td64ns(arg, unit=unit, + errors=errors, copy=False)[0] + except ValueError: + if errors == 'ignore': + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. + raise if box: from pandas import TimedeltaIndex diff --git a/pandas/core/window.py b/pandas/core/window.py index 5256532a31870..be28a3bcccec6 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -136,7 +136,7 @@ def _gotitem(self, key, ndim, subset=None): Parameters ---------- - key : string / list of selections + key : str / list of selections ndim : 1,2 requested ndim of result subset : object, default None @@ -464,15 +464,16 @@ class Window(_Window): (otherwise result is NA). For a window that is specified by an offset, `min_periods` will default to 1. Otherwise, `min_periods` will default to the size of the window. - center : boolean, default False + center : bool, default False Set the labels at the center of the window. - win_type : string, default None + win_type : str, default None Provide a window type. If ``None``, all points are evenly weighted. 
See the notes below for further information. - on : string, optional + on : str, optional For a DataFrame, column on which to calculate the rolling window, rather than the index - closed : string, default None + axis : int or str, default 0 + closed : str, default None Make the interval closed on the 'right', 'left', 'both' or 'neither' endpoints. For offset-based windows, it defaults to 'right'. @@ -481,8 +482,6 @@ class Window(_Window): .. versionadded:: 0.20.0 - axis : int or string, default 0 - Returns ------- a Window or Rolling sub-classed for the particular operation @@ -661,7 +660,7 @@ def _apply_window(self, mean=True, **kwargs): Parameters ---------- - mean : boolean, default True + mean : bool, default True If True computes weighted mean, else weighted sum Returns @@ -819,11 +818,11 @@ def _apply(self, func, name=None, window=None, center=None, Parameters ---------- - func : string/callable to apply - name : string, optional + func : str/callable to apply + name : str, optional name of this function window : int/array, default to _get_window() - center : boolean, default to self.center + center : bool, default to self.center check_minp : function, default to _use_window Returns @@ -1816,9 +1815,9 @@ class Expanding(_Rolling_and_Expanding): min_periods : int, default 1 Minimum number of observations in window required to have a value (otherwise result is NA). - center : boolean, default False + center : bool, default False Set the labels at the center of the window. - axis : int or string, default 0 + axis : int or str, default 0 Returns ------- @@ -2062,7 +2061,7 @@ def _constructor(self): Parameters ---------- -bias : boolean, default False +bias : bool, default False Use a standard estimation bias correction """ @@ -2079,7 +2078,7 @@ def _constructor(self): will be a MultiIndex DataFrame in the case of DataFrame inputs. In the case of missing elements, only complete pairwise observations will be used. -bias : boolean, default False +bias : bool, default False Use a standard estimation bias correction """ @@ -2110,10 +2109,10 @@ class EWM(_Rolling): min_periods : int, default 0 Minimum number of observations in window required to have a value (otherwise result is NA). - adjust : boolean, default True + adjust : bool, default True Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings (viewing EWMA as a moving average) - ignore_na : boolean, default False + ignore_na : bool, default False Ignore missing values when calculating weights; specify True to reproduce pre-0.15.0 behavior @@ -2242,7 +2241,7 @@ def _apply(self, func, **kwargs): Parameters ---------- - func : string/callable to apply + func : str/callable to apply Returns ------- diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index c6108f30a560a..23a2b04214e4e 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -16,7 +16,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover Parameters ---------- - sep : str, default '\s+'. + sep : str, default '\s+' A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. 
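# ----------------------------------------------------------------------
# Editor's sketch (not part of the patch): the io/excel.py diff below
# replaces the inline _should_parse closure with module-level helpers
# _excel2num / _range2cols / _maybe_convert_usecols. A condensed,
# self-contained restatement of the base-26 column arithmetic they
# implement (the compat/lrange shims from the patch are dropped here):

def excel2num(col):
    """Convert an Excel column name like 'AB' to a 0-based index."""
    index = 0
    for c in col.upper().strip():
        if not 'A' <= c <= 'Z':
            raise ValueError("Invalid column name: {col}".format(col=col))
        index = index * 26 + ord(c) - ord('A') + 1
    return index - 1


def range2cols(areas):
    """Expand a spec like 'A,C,Z:AB' into 0-based column indices."""
    cols = []
    for rng in areas.split(','):
        if ':' in rng:
            first, last = rng.split(':')
            cols.extend(range(excel2num(first), excel2num(last) + 1))
        else:
            cols.append(excel2num(rng))
    return cols


assert range2cols('A:E') == [0, 1, 2, 3, 4]
assert range2cols('A,C,Z:AB') == [0, 2, 25, 26, 27]
# ----------------------------------------------------------------------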
diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 7a7b801f4ba4a..2e93c237bb7ea 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -17,8 +17,7 @@ import pandas._libs.json as json import pandas.compat as compat from pandas.compat import ( - OrderedDict, add_metaclass, lrange, map, range, reduce, string_types, u, - zip) + OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip) from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg @@ -93,13 +92,22 @@ .. deprecated:: 0.21.0 Pass in `usecols` instead. -usecols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of Excel column letters and - column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of +usecols : int, str, list-like, or callable default None + * If None, then parse all columns, + * If int, then indicates last column to be parsed + * If string, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. + * If list of ints, then indicates list of column numbers to be parsed. + * If list of strings, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + .. versionadded:: 0.24.0 + squeeze : boolean, default False If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None @@ -466,39 +474,6 @@ def parse(self, convert_float=convert_float, **kwds) - def _should_parse(self, i, usecols): - - def _range2cols(areas): - """ - Convert comma separated list of column names and column ranges to a - list of 0-based column indexes. 
- - >>> _range2cols('A:E') - [0, 1, 2, 3, 4] - >>> _range2cols('A,C,Z:AB') - [0, 2, 25, 26, 27] - """ - def _excel2num(x): - "Convert Excel column name like 'AB' to 0-based column index" - return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1, - x.upper().strip(), 0) - 1 - - cols = [] - for rng in areas.split(','): - if ':' in rng: - rng = rng.split(':') - cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1) - else: - cols.append(_excel2num(rng)) - return cols - - if isinstance(usecols, int): - return i <= usecols - elif isinstance(usecols, compat.string_types): - return i in _range2cols(usecols) - else: - return i in usecols - def _parse_excel(self, sheet_name=0, header=0, @@ -527,10 +502,6 @@ def _parse_excel(self, raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates is True and index_col is None: - warnings.warn("The 'parse_dates=True' keyword of read_excel was " - "provided without an 'index_col' keyword value.") - import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, @@ -620,17 +591,13 @@ def _parse_cell(cell_contents, cell_typ): sheet = self.book.sheet_by_index(asheetname) data = [] - should_parse = {} + usecols = _maybe_convert_usecols(usecols) for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): - if usecols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, usecols) - - if usecols is None or should_parse[j]: - row.append(_parse_cell(value, typ)) + row.append(_parse_cell(value, typ)) data.append(row) if sheet.nrows == 0: @@ -642,24 +609,22 @@ def _parse_cell(cell_contents, cell_typ): # forward fill and pull out names for MultiIndex column header_names = None - if header is not None: - if is_list_like(header): - header_names = [] - control_row = [True] * len(data[0]) - for row in header: - if is_integer(skiprows): - row += skiprows - - data[row], control_row = _fill_mi_header( - data[row], control_row) - header_name, data[row] = _pop_header_name( - data[row], index_col) - header_names.append(header_name) - else: - data[header] = _trim_excel_header(data[header]) + if header is not None and is_list_like(header): + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + row += skiprows + + data[row], control_row = _fill_mi_header( + data[row], control_row) + header_name, _ = _pop_header_name( + data[row], index_col) + header_names.append(header_name) if is_list_like(index_col): - # forward fill values for MultiIndex index + # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: @@ -667,6 +632,7 @@ def _parse_cell(cell_contents, cell_typ): for col in index_col: last = data[offset][col] + for row in range(offset + 1, len(data)): if data[row][col] == '' or data[row][col] is None: data[row][col] = last @@ -693,11 +659,14 @@ def _parse_cell(cell_contents, cell_typ): thousands=thousands, comment=comment, skipfooter=skipfooter, + usecols=usecols, **kwds) output[asheetname] = parser.read(nrows=nrows) + if names is not None: output[asheetname].columns = names + if not squeeze or isinstance(output[asheetname], DataFrame): output[asheetname].columns = output[ asheetname].columns.set_names(header_names) @@ -726,6 +695,97 @@ def __exit__(self, exc_type, exc_value, traceback): self.close() +def _excel2num(x): + """ + Convert Excel column name like 'AB' to 0-based column index. 
+ + Parameters + ---------- + x : str + The Excel column name to convert to a 0-based column index. + + Returns + ------- + num : int + The column index corresponding to the name. + + Raises + ------ + ValueError + Part of the Excel column name was invalid. + """ + index = 0 + + for c in x.upper().strip(): + cp = ord(c) + + if cp < ord("A") or cp > ord("Z"): + raise ValueError("Invalid column name: {x}".format(x=x)) + + index = index * 26 + cp - ord("A") + 1 + + return index - 1 + + +def _range2cols(areas): + """ + Convert comma separated list of column names and ranges to indices. + + Parameters + ---------- + areas : str + A string containing a sequence of column ranges (or areas). + + Returns + ------- + cols : list + A list of 0-based column indices. + + Examples + -------- + >>> _range2cols('A:E') + [0, 1, 2, 3, 4] + >>> _range2cols('A,C,Z:AB') + [0, 2, 25, 26, 27] + """ + cols = [] + + for rng in areas.split(","): + if ":" in rng: + rng = rng.split(":") + cols.extend(lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) + else: + cols.append(_excel2num(rng)) + + return cols + + +def _maybe_convert_usecols(usecols): + """ + Convert `usecols` into a compatible format for parsing in `parsers.py`. + + Parameters + ---------- + usecols : object + The use-columns object to potentially convert. + + Returns + ------- + converted : object + The compatible format of `usecols`. + """ + if usecols is None: + return usecols + + if is_integer(usecols): + return lrange(usecols + 1) + + if isinstance(usecols, compat.string_types): + return _range2cols(usecols) + + return usecols + + def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: if ( diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index ce07a795017e5..af046d9f309e7 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -110,10 +110,10 @@ def json_normalize(data, record_path=None, meta=None, assumed to be an array of records meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table + meta_prefix : string, default None record_prefix : string, default None If True, prefix records with dotted (?) path, e.g. 
foo.bar.field if path to records is ['foo', 'bar'] - meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' * 'ignore' : will ignore KeyError if keys listed in meta are not diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 41e14e482d061..4c28e0f88b1ae 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1674,7 +1674,7 @@ def cvalues(self): def __iter__(self): return iter(self.values) - def maybe_set_size(self, min_itemsize=None, **kwargs): + def maybe_set_size(self, min_itemsize=None): """ maybe set a string col itemsize: min_itemsize can be an integer or a dict with this columns name with an integer size """ @@ -1687,13 +1687,13 @@ def maybe_set_size(self, min_itemsize=None, **kwargs): self.typ = _tables( ).StringCol(itemsize=min_itemsize, pos=self.pos) - def validate(self, handler, append, **kwargs): + def validate(self, handler, append): self.validate_names() def validate_names(self): pass - def validate_and_set(self, handler, append, **kwargs): + def validate_and_set(self, handler, append): self.set_table(handler.table) self.validate_col() self.validate_attr(append) @@ -3772,7 +3772,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(coords) - def read_column(self, column, where=None, start=None, stop=None, **kwargs): + def read_column(self, column, where=None, start=None, stop=None): """return a single column from the table, generally only indexables are interesting """ @@ -4727,7 +4727,7 @@ class Selection(object): """ - def __init__(self, table, where=None, start=None, stop=None, **kwargs): + def __init__(self, table, where=None, start=None, stop=None): self.table = table self.where = where self.start = start diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index cf1abc6f79101..9ee5e05638978 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -158,19 +158,18 @@ def box_df_fail(request): return request.param -@pytest.fixture(params=[ - pd.Index, - pd.Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="Tries to broadcast " - "incorrectly", - strict=True, raises=ValueError)) -], ids=lambda x: x.__name__) -def box_df_broadcast_failure(request): - """ - Fixture equivalent to `box` but with the common failing case where - the DataFrame operation tries to broadcast incorrectly. +@pytest.fixture(params=[(pd.Index, False), + (pd.Series, False), + (pd.DataFrame, False), + pytest.param((pd.DataFrame, True), + marks=pytest.mark.xfail(strict=True))], + ids=lambda x: x[0].__name__ + '-' + str(x[1])) +def box_transpose_fail(request): + """ + Fixture similar to `box` but testing both transpose cases for DataFrame, + with the transpose=True case xfailed.
""" + # GH#23620 return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 73921a18ee5c7..b25e9a9a485c2 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1155,14 +1155,18 @@ def test_dti_add_intarray_no_freq(self, box): def test_dti_add_timedeltalike(self, tz_naive_fixture, two_hours, box_with_datetime): # GH#22005, GH#22163 check DataFrame doesn't raise TypeError + box = box_with_datetime + tz = tz_naive_fixture rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - rng = tm.box_expected(rng, box_with_datetime) + + # FIXME: calling with transpose=True raises ValueError + rng = tm.box_expected(rng, box, transpose=False) result = rng + two_hours expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - expected = tm.box_expected(expected, box_with_datetime) + expected = tm.box_expected(expected, box, transpose=False) tm.assert_equal(result, expected) def test_dti_iadd_timedeltalike(self, tz_naive_fixture, two_hours): @@ -1192,12 +1196,15 @@ def test_dti_isub_timedeltalike(self, tz_naive_fixture, two_hours): def test_dt64arr_add_sub_td64_nat(self, box, tz_naive_fixture): # GH#23320 special handling for timedelta64("NaT") tz = tz_naive_fixture + dti = pd.date_range("1994-04-01", periods=9, tz=tz, freq="QS") other = np.timedelta64("NaT") expected = pd.DatetimeIndex(["NaT"] * 9, tz=tz) - obj = tm.box_expected(dti, box) - expected = tm.box_expected(expected, box) + # FIXME: fails with transpose=True due to tz-aware DataFrame + # transpose bug + obj = tm.box_expected(dti, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) result = obj + other tm.assert_equal(result, expected) @@ -1450,10 +1457,8 @@ def test_sub_period(self, freq, box_with_datetime): operator.sub, ops.rsub]) @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) @pytest.mark.parametrize('dti_freq', [None, 'D']) - def test_dti_sub_pi(self, dti_freq, pi_freq, op, box_df_broadcast_failure): + def test_dti_sub_pi(self, dti_freq, pi_freq, op, box): # GH#20049 subtracting PeriodIndex should raise TypeError - box = box_df_broadcast_failure - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq) pi = dti.to_period(pi_freq) @@ -1782,6 +1787,8 @@ def test_dti_with_offset_series(self, tz_naive_fixture, names): def test_dti_add_offset_tzaware(self, tz_aware_fixture, box_with_datetime): # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype + box = box_with_datetime + timezone = tz_aware_fixture if timezone == 'US/Pacific': dates = date_range('2012-11-01', periods=3, tz=timezone) @@ -1793,8 +1800,9 @@ def test_dti_add_offset_tzaware(self, tz_aware_fixture, box_with_datetime): expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', '2010-11-01 07:00'], freq='H', tz=timezone) - dates = tm.box_expected(dates, box_with_datetime) - expected = tm.box_expected(expected, box_with_datetime) + # FIXME: these raise ValueError with transpose=True + dates = tm.box_expected(dates, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) # TODO: parametrize over the scalar being added? radd? sub? 
offset = dates + pd.offsets.Hour(5) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 687d07082ea33..a26a11cb6be9e 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -70,9 +70,8 @@ def test_parr_cmp_period_scalar(self, freq, box): tm.assert_equal(per >= base, exp) @pytest.mark.parametrize('freq', ['M', '2M', '3M']) - def test_parr_cmp_pi(self, freq, box_df_fail): + def test_parr_cmp_pi(self, freq, box): # GH#13200 - box = box_df_fail xbox = np.ndarray if box is pd.Index else box base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], @@ -108,11 +107,9 @@ def test_parr_cmp_pi(self, freq, box_df_fail): tm.assert_equal(base <= idx, exp) @pytest.mark.parametrize('freq', ['M', '2M', '3M']) - def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_df_fail): + def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box): # GH#13200 # different base freq - box = box_df_fail - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq=freq) base = tm.box_expected(base, box) @@ -302,9 +299,7 @@ class TestPeriodIndexArithmetic(object): # PeriodIndex - other is defined for integers, timedelta-like others, # and PeriodIndex (with matching freq) - def test_parr_add_iadd_parr_raises(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_parr_add_iadd_parr_raises(self, box): rng = pd.period_range('1/1/2000', freq='D', periods=5) other = pd.period_range('1/6/2000', freq='D', periods=5) # TODO: parametrize over boxes for other? @@ -346,9 +341,7 @@ def test_pi_sub_pi_with_nat(self): expected = pd.Index([pd.NaT, 0 * off, 0 * off, 0 * off, 0 * off]) tm.assert_index_equal(result, expected) - def test_parr_sub_pi_mismatched_freq(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_parr_sub_pi_mismatched_freq(self, box): rng = pd.period_range('1/1/2000', freq='D', periods=5) other = pd.period_range('1/6/2000', freq='H', periods=5) # TODO: parametrize over boxes for other? 
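# ----------------------------------------------------------------------
# Editor's note (illustrative, outside the patch): the box_transpose_fail
# fixture added in conftest.py above yields (box, transpose) pairs, with
# (DataFrame, True) xfailed. The transpose flag controls the shape that
# tm.box_expected produces, which is what the FIXME comments and
# transpose=False workarounds in these test hunks are about:

import pandas as pd
import pandas.util.testing as tm

tdi = pd.timedelta_range('1 day', periods=3)

col = tm.box_expected(tdi, pd.DataFrame, transpose=False)
row = tm.box_expected(tdi, pd.DataFrame)  # transpose=True is the default

print(col.shape)  # (3, 1): one column, so ops align like a Series
print(row.shape)  # (1, 3): one row, exercising the DataFrame broadcasting
                  # path that is still buggy for tz-aware and Period data
# ----------------------------------------------------------------------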
@@ -364,9 +357,6 @@ def test_parr_sub_pi_mismatched_freq(self, box_df_broadcast_failure): @pytest.mark.parametrize('op', [operator.add, ops.radd, operator.sub, ops.rsub]) def test_pi_add_sub_float(self, op, other, box): - if box is pd.DataFrame and isinstance(other, np.ndarray): - pytest.xfail(reason="Tries to broadcast incorrectly") - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') pi = dti.to_period('D') pi = tm.box_expected(pi, box) @@ -563,15 +553,18 @@ def test_pi_sub_isub_offset(self): rng -= pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_offset_n_gt1(self, box): + def test_pi_add_offset_n_gt1(self, box_transpose_fail): # GH#23215 # add offset to PeriodIndex with freq.n > 1 + box, transpose = box_transpose_fail + per = pd.Period('2016-01', freq='2M') pi = pd.PeriodIndex([per]) expected = pd.PeriodIndex(['2016-03'], freq='2M') - pi = tm.box_expected(pi, box) - expected = tm.box_expected(expected, box) + + pi = tm.box_expected(pi, box, transpose=transpose) + expected = tm.box_expected(expected, box, transpose=transpose) result = pi + per.freq tm.assert_equal(result, expected) @@ -582,12 +575,14 @@ def test_pi_add_offset_n_gt1(self, box): def test_pi_add_offset_n_gt1_not_divisible(self, box_with_period): # GH#23215 # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0 + box = box_with_period pi = pd.PeriodIndex(['2016-01'], freq='2M') - pi = tm.box_expected(pi, box_with_period) - expected = pd.PeriodIndex(['2016-04'], freq='2M') - expected = tm.box_expected(expected, box_with_period) + + # FIXME: with transposing these tests fail + pi = tm.box_expected(pi, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) result = pi + to_offset('3M') tm.assert_equal(result, expected) @@ -801,14 +796,16 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, with pytest.raises(period.IncompatibleFrequency, match=msg): rng -= other - def test_parr_add_sub_td64_nat(self, box): + def test_parr_add_sub_td64_nat(self, box_transpose_fail): # GH#23320 special handling for timedelta64("NaT") + box, transpose = box_transpose_fail + pi = pd.period_range("1994-04-01", periods=9, freq="19D") other = np.timedelta64("NaT") expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") - obj = tm.box_expected(pi, box) - expected = tm.box_expected(expected, box) + obj = tm.box_expected(pi, box, transpose=transpose) + expected = tm.box_expected(expected, box, transpose=transpose) result = obj + other tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index f92a772f3eaad..58c8b3b07f723 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -367,10 +367,6 @@ def test_td64arr_add_str_invalid(self, box): operator.sub, ops.rsub], ids=lambda x: x.__name__) def test_td64arr_add_sub_float(self, box, op, other): - if box is pd.DataFrame and isinstance(other, np.ndarray): - pytest.xfail("Tries to broadcast, raising " - "ValueError instead of TypeError") - tdi = TimedeltaIndex(['-1 days', '-1 days']) tdi = tm.box_expected(tdi, box) @@ -393,9 +389,8 @@ def test_td64arr_sub_period(self, box, freq): @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) @pytest.mark.parametrize('tdi_freq', [None, 'H']) - def test_td64arr_sub_pi(self, box_df_broadcast_failure, tdi_freq, pi_freq): + def test_td64arr_sub_pi(self, box, tdi_freq, pi_freq): # GH#20049 subtracting PeriodIndex should raise TypeError - box = 
box_df_broadcast_failure tdi = TimedeltaIndex(['1 hours', '2 hours'], freq=tdi_freq) dti = Timestamp('2018-03-07 17:16:40') + tdi pi = dti.to_period(pi_freq) @@ -427,8 +422,10 @@ def test_td64arr_add_timestamp(self, box, tz_naive_fixture): idx = TimedeltaIndex(['1 day', '2 day']) expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz) - idx = tm.box_expected(idx, box) - expected = tm.box_expected(expected, box) + # FIXME: fails with transpose=True because of tz-aware DataFrame + # transpose bug + idx = tm.box_expected(idx, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) result = idx + other tm.assert_equal(result, expected) @@ -460,9 +457,7 @@ def test_td64arr_add_sub_timestamp(self, box): with pytest.raises(TypeError): tdser - ts - def test_tdi_sub_dt64_array(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_tdi_sub_dt64_array(self, box): dti = pd.date_range('2016-01-01', periods=3) tdi = dti - dti.shift(1) dtarr = dti.values @@ -478,9 +473,7 @@ def test_tdi_sub_dt64_array(self, box_df_broadcast_failure): result = dtarr - tdi tm.assert_equal(result, expected) - def test_tdi_add_dt64_array(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_tdi_add_dt64_array(self, box): dti = pd.date_range('2016-01-01', periods=3) tdi = dti - dti.shift(1) dtarr = dti.values @@ -524,9 +517,8 @@ def test_td64arr_add_int_series_invalid(self, box, tdser): with pytest.raises(err): int_ser - tdser - def test_td64arr_add_intlike(self, box_df_broadcast_failure): + def test_td64arr_add_intlike(self, box): # GH#19123 - box = box_df_broadcast_failure tdi = TimedeltaIndex(['59 days', '59 days', 'NaT']) ser = tm.box_expected(tdi, box) err = TypeError if box is not pd.Index else NullFrequencyError @@ -580,9 +572,6 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box, scalar, tdser): # TODO: Add DataFrame in here? 
], ids=lambda x: type(x).__name__) def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype, tdser): - if box is pd.DataFrame and not isinstance(vec, Series): - raise pytest.xfail(reason="Tries to broadcast incorrectly") - tdser = tm.box_expected(tdser, box) err = TypeError if box is pd.Index and not dtype.startswith('float'): @@ -655,9 +644,7 @@ def test_timedelta64_operations_with_timedeltas(self): # roundtrip tm.assert_series_equal(result + td2, td1) - def test_td64arr_add_td64_array(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_td64arr_add_td64_array(self, box): dti = pd.date_range('2016-01-01', periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -671,9 +658,7 @@ def test_td64arr_add_td64_array(self, box_df_broadcast_failure): result = tdarr + tdi tm.assert_equal(result, expected) - def test_td64arr_sub_td64_array(self, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_td64arr_sub_td64_array(self, box): dti = pd.date_range('2016-01-01', periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -691,10 +676,13 @@ def test_td64arr_sub_td64_array(self, box_df_broadcast_failure): @pytest.mark.parametrize('names', [(None, None, None), ('Egon', 'Venkman', None), ('NCC1701D', 'NCC1701D', 'NCC1701D')]) - def test_td64arr_add_sub_tdi(self, box_df_broadcast_failure, names): + def test_td64arr_add_sub_tdi(self, box, names): # GH#17250 make sure result dtype is correct # GH#19043 make sure names are propagated correctly - box = box_df_broadcast_failure + if box is pd.DataFrame and names[1] == 'Venkman': + pytest.skip("Name propagation for DataFrame does not behave like " + "it does for Index/Series") + tdi = TimedeltaIndex(['0 days', '1 day'], name=names[0]) ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], @@ -825,9 +813,12 @@ def test_timedelta64_operations_with_DateOffset(self): @pytest.mark.parametrize('names', [(None, None, None), ('foo', 'bar', None), ('foo', 'foo', 'foo')]) - def test_td64arr_add_offset_index(self, names, box_df_broadcast_failure): + def test_td64arr_add_offset_index(self, names, box): # GH#18849, GH#19744 - box = box_df_broadcast_failure + if box is pd.DataFrame and names[1] == 'bar': + pytest.skip("Name propagation for DataFrame does not behave like " + "it does for Index/Series") + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], name=names[0]) other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], @@ -838,19 +829,21 @@ def test_td64arr_add_offset_index(self, names, box_df_broadcast_failure): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res2 = other + tdi tm.assert_equal(res2, expected) # TODO: combine with test_td64arr_add_offset_index by parametrizing # over second box? 
- def test_td64arr_add_offset_array(self, box_df_broadcast_failure): + def test_td64arr_add_offset_array(self, box): # GH#18849 - box = box_df_broadcast_failure tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) @@ -860,20 +853,26 @@ def test_td64arr_add_offset_array(self, box_df_broadcast_failure): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(warn): res2 = other + tdi tm.assert_equal(res2, expected) @pytest.mark.parametrize('names', [(None, None, None), ('foo', 'bar', None), ('foo', 'foo', 'foo')]) - def test_td64arr_sub_offset_index(self, names, box_df_broadcast_failure): + def test_td64arr_sub_offset_index(self, names, box): # GH#18824, GH#19744 - box = box_df_broadcast_failure + if box is pd.DataFrame and names[1] == 'bar': + pytest.skip("Name propagation for DataFrame does not behave like " + "it does for Index/Series") + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], name=names[0]) other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], @@ -885,13 +884,15 @@ def test_td64arr_sub_offset_index(self, names, box_df_broadcast_failure): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) - def test_td64arr_sub_offset_array(self, box_df_broadcast_failure): + def test_td64arr_sub_offset_array(self, box): # GH#18824 - box = box_df_broadcast_failure tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) @@ -901,7 +902,10 @@ def test_td64arr_sub_offset_array(self, box_df_broadcast_failure): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) - with tm.assert_produces_warning(PerformanceWarning): + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): res = tdi - other tm.assert_equal(res, expected) @@ -943,9 +947,6 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): @pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series]) def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box): # GH#18824 - if box is pd.DataFrame and obox is not pd.Series: - raise pytest.xfail(reason="Attempts to broadcast incorrectly") - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) tdi = tm.box_expected(tdi, box) @@ -1023,8 +1024,7 @@ def test_tdi_mul_int_array_zerodim(self, box): result = idx * np.array(5, dtype='int64') tm.assert_equal(result, expected) - def test_tdi_mul_int_array(self, box_df_broadcast_failure): - box = box_df_broadcast_failure + def test_tdi_mul_int_array(self, box): rng5 = 
np.arange(5, dtype='int64') idx = TimedeltaIndex(rng5) expected = TimedeltaIndex(rng5 ** 2) @@ -1035,8 +1035,7 @@ def test_tdi_mul_int_array(self, box_df_broadcast_failure): result = idx * rng5 tm.assert_equal(result, expected) - def test_tdi_mul_int_series(self, box_df_fail): - box = box_df_fail + def test_tdi_mul_int_series(self, box): idx = TimedeltaIndex(np.arange(5, dtype='int64')) expected = TimedeltaIndex(np.arange(5, dtype='int64') ** 2) @@ -1048,17 +1047,16 @@ def test_tdi_mul_int_series(self, box_df_fail): result = idx * pd.Series(np.arange(5, dtype='int64')) tm.assert_equal(result, expected) - def test_tdi_mul_float_series(self, box_df_fail): - box = box_df_fail + def test_tdi_mul_float_series(self, box): idx = TimedeltaIndex(np.arange(5, dtype='int64')) idx = tm.box_expected(idx, box) rng5f = np.arange(5, dtype='float64') - expected = TimedeltaIndex(rng5f * (rng5f + 0.1)) + expected = TimedeltaIndex(rng5f * (rng5f + 1.0)) box2 = pd.Series if box is pd.Index else box expected = tm.box_expected(expected, box2) - result = idx * Series(rng5f + 0.1) + result = idx * Series(rng5f + 1.0) tm.assert_equal(result, expected) # TODO: Put Series/DataFrame in others? @@ -1069,9 +1067,7 @@ def test_tdi_mul_float_series(self, box_df_fail): pd.Float64Index(range(1, 11)), pd.RangeIndex(1, 11) ], ids=lambda x: type(x).__name__) - def test_tdi_rmul_arraylike(self, other, box_df_broadcast_failure): - box = box_df_broadcast_failure - + def test_tdi_rmul_arraylike(self, other, box): tdi = TimedeltaIndex(['1 Day'] * 10) expected = timedelta_range('1 days', '10 days') @@ -1131,8 +1127,8 @@ def test_td64arr_floordiv_tdscalar(self, box, scalar_td): expected = Series([0, 0, np.nan]) - td1 = tm.box_expected(td1, box) - expected = tm.box_expected(expected, box) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) result = td1 // scalar_td tm.assert_equal(result, expected) @@ -1144,8 +1140,8 @@ def test_td64arr_rfloordiv_tdscalar(self, box, scalar_td): expected = Series([1, 1, np.nan]) - td1 = tm.box_expected(td1, box) - expected = tm.box_expected(expected, box) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) result = scalar_td // td1 tm.assert_equal(result, expected) @@ -1157,8 +1153,8 @@ def test_td64arr_rfloordiv_tdscalar_explicit(self, box, scalar_td): expected = Series([1, 1, np.nan]) - td1 = tm.box_expected(td1, box) - expected = tm.box_expected(expected, box) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) # We can test __rfloordiv__ using this syntax, # see `test_timedelta_rfloordiv` @@ -1192,14 +1188,14 @@ def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box): tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None) expected = pd.Index([2.0, 2.0, np.nan]) - tdi = tm.box_expected(tdi, box) - expected = tm.box_expected(expected, box) + tdi = tm.box_expected(tdi, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=False) res = tdi.__rfloordiv__(scalar_td) tm.assert_equal(res, expected) expected = pd.Index([0.0, 0.0, np.nan]) - expected = tm.box_expected(expected, box) + expected = tm.box_expected(expected, box, transpose=False) res = tdi // (scalar_td) tm.assert_equal(res, expected) @@ -1283,11 +1279,10 @@ def test_td64arr_div_numeric_scalar(self, box, two, tdser): Series([20, 30, 40])], ids=lambda x: type(x).__name__) @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) - 
def test_td64arr_rmul_numeric_array(self, op, box_df_fail, + def test_td64arr_rmul_numeric_array(self, op, box, vector, dtype, tdser): # GH#4521 # divide/multiply by integers - box = box_df_fail # broadcasts incorrectly but doesn't raise vector = vector.astype(dtype) expected = Series(['1180 Days', '1770 Days', 'NaT'], @@ -1301,14 +1296,6 @@ def test_td64arr_rmul_numeric_array(self, op, box_df_fail, result = op(vector, tdser) tm.assert_equal(result, expected) - @pytest.mark.parametrize('box', [ - pd.Index, - Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="broadcasts along " - "wrong axis", - strict=True)) - ], ids=lambda x: x.__name__) @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', 'uint64', 'uint32', 'uint16', 'uint8', 'float64', 'float32', 'float16']) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 04d7f4d498c2b..c15696705ab82 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -3,7 +3,6 @@ import warnings import numpy as np -from numpy import nan import pytest from pandas._libs.sparse import IntIndex @@ -24,7 +23,8 @@ def kind(request): class TestSparseArray(object): def setup_method(self, method): - self.arr_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) + self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, + np.nan, 4, 5, np.nan, 6]) self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3fd03a351de7c..bb4022c9cac9a 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,7 +4,8 @@ import pandas as pd from pandas.core.arrays import ( - DatetimeArrayMixin, PeriodArray, TimedeltaArrayMixin) + DatetimeArrayMixin as DatetimeArray, PeriodArray, + TimedeltaArrayMixin as TimedeltaArray) import pandas.util.testing as tm @@ -57,10 +58,58 @@ def timedelta_index(request): class TestDatetimeArray(object): + def test_array_object_dtype(self, tz_naive_fixture): + # GH#23524 + tz = tz_naive_fixture + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = np.array(list(dti)) + + result = np.array(arr, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # also test the DatetimeIndex method while we're at it + result = np.array(dti, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_array(self, tz_naive_fixture): + # GH#23524 + tz = tz_naive_fixture + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = dti.asi8.view('M8[ns]') + result = np.array(arr) + tm.assert_numpy_array_equal(result, expected) + + # check that we are not making copies when setting copy=False + result = np.array(arr, copy=False) + assert result.base is expected.base + assert result.base is not None + + def test_array_i8_dtype(self, tz_naive_fixture): + # GH#23524 + tz = tz_naive_fixture + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = dti.asi8 + result = np.array(arr, dtype='i8') + tm.assert_numpy_array_equal(result, expected) + + result = np.array(arr, dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + # check that we are not making copies when setting copy=False + result = np.array(arr, dtype='i8', copy=False) + assert result.base is expected.base + assert result.base is not None + def 
test_from_dti(self, tz_naive_fixture): tz = tz_naive_fixture dti = pd.date_range('2016-01-01', periods=3, tz=tz) - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) assert list(dti) == list(arr) # Check that Index.__new__ knows what to do with DatetimeArray @@ -71,7 +120,7 @@ def test_from_dti(self, tz_naive_fixture): def test_astype_object(self, tz_naive_fixture): tz = tz_naive_fixture dti = pd.date_range('2016-01-01', periods=3, tz=tz) - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) asobj = arr.astype('O') assert isinstance(asobj, np.ndarray) assert asobj.dtype == 'O' @@ -81,11 +130,11 @@ def test_astype_object(self, tz_naive_fixture): def test_to_perioddelta(self, datetime_index, freqstr): # GH#23113 dti = datetime_index - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) expected = dti.to_perioddelta(freq=freqstr) result = arr.to_perioddelta(freq=freqstr) - assert isinstance(result, TimedeltaArrayMixin) + assert isinstance(result, TimedeltaArray) # placeholder until these become actual EA subclasses and we can use # an EA-specific tm.assert_ function @@ -94,7 +143,7 @@ def test_to_perioddelta(self, datetime_index, freqstr): @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y']) def test_to_period(self, datetime_index, freqstr): dti = datetime_index - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) expected = dti.to_period(freq=freqstr) result = arr.to_period(freq=freqstr) @@ -108,7 +157,7 @@ def test_to_period(self, datetime_index, freqstr): def test_bool_properties(self, datetime_index, propname): # in this case _bool_ops is just `is_leap_year` dti = datetime_index - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) assert dti.freq == arr.freq result = getattr(arr, propname) @@ -119,7 +168,7 @@ def test_bool_properties(self, datetime_index, propname): @pytest.mark.parametrize('propname', pd.DatetimeIndex._field_ops) def test_int_properties(self, datetime_index, propname): dti = datetime_index - arr = DatetimeArrayMixin(dti) + arr = DatetimeArray(dti) result = getattr(arr, propname) expected = np.array(getattr(dti, propname), dtype=result.dtype) @@ -130,7 +179,7 @@ def test_int_properties(self, datetime_index, propname): class TestTimedeltaArray(object): def test_from_tdi(self): tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) - arr = TimedeltaArrayMixin(tdi) + arr = TimedeltaArray(tdi) assert list(arr) == list(tdi) # Check that Index.__new__ knows what to do with TimedeltaArray @@ -140,7 +189,7 @@ def test_from_tdi(self): def test_astype_object(self): tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) - arr = TimedeltaArrayMixin(tdi) + arr = TimedeltaArray(tdi) asobj = arr.astype('O') assert isinstance(asobj, np.ndarray) assert asobj.dtype == 'O' @@ -148,7 +197,7 @@ def test_astype_object(self): def test_to_pytimedelta(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArrayMixin(tdi) + arr = TimedeltaArray(tdi) expected = tdi.to_pytimedelta() result = arr.to_pytimedelta() @@ -157,7 +206,7 @@ def test_to_pytimedelta(self, timedelta_index): def test_total_seconds(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArrayMixin(tdi) + arr = TimedeltaArray(tdi) expected = tdi.total_seconds() result = arr.total_seconds() @@ -167,7 +216,7 @@ def test_total_seconds(self, timedelta_index): @pytest.mark.parametrize('propname', pd.TimedeltaIndex._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index - arr = TimedeltaArrayMixin(tdi) + arr = TimedeltaArray(tdi) result = getattr(arr, propname) 
expected = np.array(getattr(tdi, propname), dtype=result.dtype) @@ -200,9 +249,9 @@ def test_to_timestamp(self, how, period_index): pi = period_index arr = PeriodArray(pi) - expected = DatetimeArrayMixin(pi.to_timestamp(how=how)) + expected = DatetimeArray(pi.to_timestamp(how=how)) result = arr.to_timestamp(how=how) - assert isinstance(result, DatetimeArrayMixin) + assert isinstance(result, DatetimeArray) # placeholder until these become actual EA subclasses and we can use # an EA-specific tm.assert_ function diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a15295cfbd81a..2b630b98b69a2 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """ Tests for DatetimeArray """ diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index c0b3ab3592188..173f9707e76c2 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -317,11 +317,11 @@ def test_rpow_one_to_na(self): class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, s, data, op_name, other): + def _compare_other(self, data, op_name, other): op = self.get_op_from_name(op_name) # array - result = op(s, other) + result = pd.Series(op(data, other)) expected = pd.Series(op(data._data, other)) # fill the nan locations @@ -343,14 +343,12 @@ def _compare_other(self, s, data, op_name, other): def test_compare_scalar(self, data, all_compare_operators): op_name = all_compare_operators - s = pd.Series(data) - self._compare_other(s, data, op_name, 0) + self._compare_other(data, op_name, 0) def test_compare_array(self, data, all_compare_operators): op_name = all_compare_operators - s = pd.Series(data) other = pd.Series([0] * len(data)) - self._compare_other(s, data, op_name, other) + self._compare_other(data, op_name, other) class TestCasting(object): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index f4fb8f2814a70..bf139bb0ce616 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -190,7 +190,7 @@ def test_setitem_raises_type(): # ---------------------------------------------------------------------------- # Ops -def tet_sub_period(): +def test_sub_period(): arr = period_array(['2000', '2001'], freq='D') other = pd.Period("2000", freq="M") with pytest.raises(IncompatibleFrequency, match="freq"): diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py new file mode 100644 index 0000000000000..3ff807daeeab9 --- /dev/null +++ b/pandas/tests/arrays/test_timedeltas.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + + +class TestTimedeltaArray(object): + pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 7fd389e19325c..279bfb5dc8eab 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -72,10 +72,10 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - @pytest.mark.skip(reason="Memory usage doesn't match") - def test_memory_usage(self): + @pytest.mark.skip(reason="Memory usage doesn't match", strict=True) + def test_memory_usage(self, data): # Is this deliberate? 
- pass + super(TestInterface, self).test_memory_usage(data) class TestConstructors(base.BaseConstructorsTests): @@ -83,69 +83,56 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - @pytest.mark.skip(reason="Unobserved categories preseved in concat.") - def test_concat_columns(self, data, na_value): - pass - - @pytest.mark.skip(reason="Unobserved categories preseved in concat.") - def test_align(self, data, na_value): - pass - - @pytest.mark.skip(reason="Unobserved categories preseved in concat.") - def test_align_frame(self, data, na_value): - pass - - @pytest.mark.skip(reason="Unobserved categories preseved in concat.") - def test_merge(self, data, na_value): - pass + pass class TestGetitem(base.BaseGetitemTests): - skip_take = pytest.mark.skip(reason="GH-20664.") + skip_take = pytest.mark.skip(reason="GH-20664.", strict=True) - @pytest.mark.skip(reason="Backwards compatibility") - def test_getitem_scalar(self): + @pytest.mark.skip(reason="Backwards compatibility", strict=True) + def test_getitem_scalar(self, data): # CategoricalDtype.type isn't "correct" since it should # be a parent of the elements (object). But don't want # to break things by changing. - pass + super(TestGetitem, self).test_getitem_scalar(data) @skip_take - def test_take(self): + def test_take(self, data, na_value, na_cmp): # TODO remove this once Categorical.take is fixed - pass + super(TestGetitem, self).test_take(data, na_value, na_cmp) @skip_take - def test_take_negative(self): - pass + def test_take_negative(self, data): + super().test_take_negative(data) @skip_take - def test_take_pandas_style_negative_raises(self): - pass + def test_take_pandas_style_negative_raises(self, data, na_value): + super().test_take_pandas_style_negative_raises(data, na_value) @skip_take - def test_take_non_na_fill_value(self): - pass + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) @skip_take - def test_take_out_of_bounds_raises(self): - pass + def test_take_out_of_bounds_raises(self, data, allow_fill): + return super().test_take_out_of_bounds_raises(data, allow_fill) - @pytest.mark.skip(reason="GH-20747. Unobserved categories.") - def test_take_series(self): - pass + @pytest.mark.skip(reason="GH-20747. 
Unobserved categories.", strict=True) + def test_take_series(self, data): + super().test_take_series(data) @skip_take - def test_reindex_non_na_fill_value(self): - pass + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) - @pytest.mark.skip(reason="Categorical.take buggy") - def test_take_empty(self): - pass + @pytest.mark.skip(reason="Categorical.take buggy", strict=True) + def test_take_empty(self, data, na_value, na_cmp): + super().test_take_empty(data, na_value, na_cmp) - @pytest.mark.skip(reason="test not written correctly for categorical") - def test_reindex(self): - pass + @pytest.mark.skip(reason="test not written correctly for categorical", + strict=True) + def test_reindex(self, data, na_value): + super().test_reindex(data, na_value) class TestSetitem(base.BaseSetitemTests): @@ -154,13 +141,13 @@ class TestSetitem(base.BaseSetitemTests): class TestMissing(base.BaseMissingTests): - @pytest.mark.skip(reason="Not implemented") - def test_fillna_limit_pad(self): - pass + @pytest.mark.skip(reason="Not implemented", strict=True) + def test_fillna_limit_pad(self, data_missing): + super().test_fillna_limit_pad(data_missing) - @pytest.mark.skip(reason="Not implemented") - def test_fillna_limit_backfill(self): - pass + @pytest.mark.skip(reason="Not implemented", strict=True) + def test_fillna_limit_backfill(self, data_missing): + super().test_fillna_limit_backfill(data_missing) class TestReduce(base.BaseNoReduceTests): @@ -168,11 +155,9 @@ class TestReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - pass - - @pytest.mark.skip(reason="Unobserved categories included") + @pytest.mark.skip(reason="Unobserved categories included", strict=True) def test_value_counts(self, all_data, dropna): - pass + return super().test_value_counts(all_data, dropna) def test_combine_add(self, data_repeated): # GH 20825 @@ -190,9 +175,9 @@ def test_combine_add(self, data_repeated): expected = pd.Series([a + val for a in list(orig_data1)]) self.assert_series_equal(result, expected) - @pytest.mark.skip(reason="Not Applicable") + @pytest.mark.skip(reason="Not Applicable", strict=True) def test_fillna_length_mismatch(self, data_missing): - pass + super().test_fillna_length_mismatch(data_missing) class TestCasting(base.BaseCastingTests): diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index b0e7fe2e25a6c..78aa853f68459 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2277,19 +2277,34 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[[1, -1], 0] assert_series_equal(df.loc[0.2, 'a'], expect) + def test_getitem_sparse_column(self): + # https://github.com/pandas-dev/pandas/issues/23559 + data = pd.SparseArray([0, 1]) + df = pd.DataFrame({"A": data}) + expected = pd.Series(data, name="A") + result = df['A'] + tm.assert_series_equal(result, expected) + + result = df.iloc[:, 0] + tm.assert_series_equal(result, expected) + + result = df.loc[:, 'A'] + tm.assert_series_equal(result, expected) + def test_setitem_with_sparse_value(self): # GH8131 df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) - sp_series = pd.Series([0, 0, 1]).to_sparse(fill_value=0) - df['new_column'] = sp_series - assert_series_equal(df['new_column'], sp_series, check_names=False) + sp_array = pd.SparseArray([0, 0, 1]) + df['new_column'] = sp_array + assert_series_equal(df['new_column'], + pd.Series(sp_array, name='new_column'), + check_names=False) def 
test_setitem_with_unaligned_sparse_value(self): df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) - sp_series = (pd.Series([0, 0, 1], index=[2, 1, 0]) - .to_sparse(fill_value=0)) + sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) df['new_column'] = sp_series - exp = pd.SparseSeries([1, 0, 0], name='new_column') + exp = pd.Series(pd.SparseArray([1, 0, 0]), name='new_column') assert_series_equal(df['new_column'], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 89d45639f3e03..bbe4914b5f447 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -6,7 +6,6 @@ import pytest -from numpy import nan import numpy as np from pandas.compat import range @@ -328,7 +327,7 @@ def test_combineFrame(self): frame_copy = self.frame.reindex(self.frame.index[::2]) del frame_copy['D'] - frame_copy['C'][:5] = nan + frame_copy['C'][:5] = np.nan added = self.frame + frame_copy diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 96c523b10197f..07cbb8cdcde0a 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -7,7 +7,6 @@ import sys import textwrap -from numpy import nan import numpy as np import pytest @@ -49,8 +48,8 @@ def test_repr_mixed_big(self): biggie = DataFrame({'A': np.random.randn(200), 'B': tm.makeStringIndex(200)}, index=lrange(200)) - biggie.loc[:20, 'A'] = nan - biggie.loc[:20, 'B'] = nan + biggie.loc[:20, 'A'] = np.nan + biggie.loc[:20, 'B'] = np.nan foo = repr(biggie) # noqa diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 5794630e72419..4f04169d08206 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -6,7 +6,6 @@ import pytest -from numpy import nan from numpy.random import randn import numpy as np @@ -516,8 +515,8 @@ def test_first_last_valid(self, data, idx, expected_first, expected_last): N = len(self.frame.index) mat = randn(N) - mat[:5] = nan - mat[-5:] = nan + mat[:5] = np.nan + mat[-5:] = np.nan frame = DataFrame({'foo': mat}, index=self.frame.index) index = frame.first_valid_index() @@ -533,7 +532,7 @@ def test_first_last_valid(self, data, idx, assert empty.first_valid_index() is None # GH17400: no valid entries - frame[:] = nan + frame[:] = np.nan assert frame.last_valid_index() is None assert frame.first_valid_index() is None diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index b56375d0a8670..cd43cfe34d80b 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -6,7 +6,6 @@ import csv import pytest -from numpy import nan import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) @@ -52,7 +51,7 @@ def test_from_csv_deprecation(self): def test_to_csv_from_csv1(self): with ensure_clean('__tmp_to_csv_from_csv1__') as path: - self.frame['A'][:5] = nan + self.frame['A'][:5] = np.nan self.frame.to_csv(path) self.frame.to_csv(path, columns=['A', 'B']) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d0e1f04238366..a0cc653a28b06 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -9,7 +9,6 @@ import pytest import numpy as np -from numpy import nan import pandas as pd from pandas import (bdate_range, DataFrame, Index, 
Series, Timestamp, @@ -36,11 +35,11 @@ 'max', ]) def test_cythonized_aggers(op_name): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan], 'B': ['A', 'B'] * 6, 'C': np.random.randn(12)} df = DataFrame(data) - df.loc[2:10:2, 'C'] = nan + df.loc[2:10:2, 'C'] = np.nan op = lambda x: getattr(x, op_name)() diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 04b2c4f280588..42a75f277faa6 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -318,7 +318,8 @@ def test_constructor_coverage(self): pytest.raises(ValueError, DatetimeIndex, start='1/1/2000', end='1/10/2000') - pytest.raises(ValueError, DatetimeIndex, '1/1/2000') + with pytest.raises(TypeError): + DatetimeIndex('1/1/2000') # generator expression gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666420a6a9b06..4a3efe22926f7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -132,7 +132,7 @@ def test_construction_list_tuples_nan(self, na_value, vtype): @pytest.mark.parametrize("cast_as_obj", [True, False]) @pytest.mark.parametrize("index", [ pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern'), # DTI with tz + tz='US/Eastern', name='Green Eggs & Ham'), # DTI with tz pd.date_range('2015-01-01 10:00', freq='D', periods=3), # DTI no tz pd.timedelta_range('1 days', freq='D', periods=3), # td pd.period_range('2015-01-01', freq='D', periods=3) # period @@ -145,8 +145,16 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): tm.assert_index_equal(result, index) - if isinstance(index, pd.DatetimeIndex) and hasattr(index, 'tz'): + if isinstance(index, pd.DatetimeIndex): assert result.tz == index.tz + if cast_as_obj: + # GH#23524 check that Index(dti, dtype=object) does not + # incorrectly raise ValueError, and that nanoseconds are not + # dropped + index += pd.Timedelta(nanoseconds=50) + result = pd.Index(index, dtype=object) + assert result.dtype == np.object_ + assert list(result) == list(index) @pytest.mark.parametrize("index,has_tz", [ (pd.date_range('2015-01-01 10:00', freq='D', periods=3, diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index a03698c9ea0de..82337ac37fbee 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -453,10 +453,16 @@ def test_timedelta_ops_with_missing_values(self): # setup s1 = pd.to_timedelta(Series(['00:00:01'])) s2 = pd.to_timedelta(Series(['00:00:02'])) - sn = pd.to_timedelta(Series([pd.NaT])) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # Passing datetime64-dtype data to TimedeltaIndex is deprecated + sn = pd.to_timedelta(Series([pd.NaT])) + df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta) df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta) - dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # Passing datetime64-dtype data to TimedeltaIndex is deprecated + dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + scalar1 = pd.to_timedelta('00:00:01') scalar2 = pd.to_timedelta('00:00:02') timedelta_NaT = pd.to_timedelta('NaT') diff --git 
a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 1abda624777c8..074c8904b55b1 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -5,11 +5,81 @@ import pandas as pd import pandas.util.testing as tm -from pandas import TimedeltaIndex, timedelta_range, to_timedelta +from pandas import TimedeltaIndex, timedelta_range, to_timedelta, Timedelta class TestTimedeltaIndex(object): + def test_int64_nocopy(self): + # GH#23539 check that a copy isn't made when we pass int64 data + # and copy=False + arr = np.arange(10, dtype=np.int64) + tdi = TimedeltaIndex(arr, copy=False) + assert tdi._data.base is arr + + def test_infer_from_tdi(self): + # GH#23539 + # fast-path for inferring a frequency if the passed data already + # has one + tdi = pd.timedelta_range('1 second', periods=10**7, freq='1s') + + result = pd.TimedeltaIndex(tdi, freq='infer') + assert result.freq == tdi.freq + + # check that inferred_freq was not called by checking that the + # value has not been cached + assert "inferred_freq" not in getattr(result, "_cache", {}) + + def test_infer_from_tdi_mismatch(self): + # GH#23539 + # fast-path for invalidating a frequency if the passed data already + # has one and it does not match the `freq` input + tdi = pd.timedelta_range('1 second', periods=100, freq='1s') + + msg = ("Inferred frequency .* from passed values does " + "not conform to passed frequency") + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(tdi, freq='D') + + def test_dt64_data_invalid(self): + # GH#23539 + # passing tz-aware DatetimeIndex raises, naive or ndarray[datetime64] + # does not yet, but will in the future + dti = pd.date_range('2016-01-01', periods=3) + + msg = "cannot be converted to timedelta64" + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(dti.tz_localize('Europe/Brussels')) + + with tm.assert_produces_warning(FutureWarning): + TimedeltaIndex(dti) + + with tm.assert_produces_warning(FutureWarning): + TimedeltaIndex(np.asarray(dti)) + + def test_float64_ns_rounded(self): + # GH#23539 without specifying a unit, floats are regarded as nanos, + # and fractional portions are truncated + tdi = TimedeltaIndex([2.3, 9.7]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # integral floats are non-lossy + tdi = TimedeltaIndex([2.0, 9.0]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # NaNs get converted to NaT + tdi = TimedeltaIndex([2.0, np.nan]) + expected = TimedeltaIndex([pd.Timedelta(nanoseconds=2), pd.NaT]) + tm.assert_index_equal(tdi, expected) + + def test_float64_unit_conversion(self): + # GH#23539 + tdi = TimedeltaIndex([1.5, 2.25], unit='D') + expected = TimedeltaIndex([Timedelta(days=1.5), Timedelta(days=2.25)]) + tm.assert_index_equal(tdi, expected) + def test_construction_base_constructor(self): arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) @@ -63,7 +133,8 @@ def test_constructor_coverage(self): pytest.raises(ValueError, TimedeltaIndex, start='1 days', end='10 days') - pytest.raises(ValueError, TimedeltaIndex, '1 days') + with pytest.raises(TypeError): + TimedeltaIndex('1 days') # generator expression gen = (timedelta(i) for i in range(10)) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 2fc0a49d789fd..989955c0d7ee7 100644 --- 
a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -54,8 +54,7 @@ def test_minmax(self): assert pd.isna(getattr(obj, op)()) def test_numpy_minmax(self): - dr = pd.date_range(start='2016-01-15', end='2016-01-20') - td = TimedeltaIndex(np.asarray(dr)) + td = timedelta_range('16815 days', '16820 days', freq='D') assert np.min(td) == Timedelta('16815 days') assert np.max(td) == Timedelta('16820 days') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4bff39f8c7efc..49a3a3d58672d 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -179,6 +179,65 @@ def test_usecols_str(self, ext): tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) + @pytest.mark.parametrize("usecols", [ + [0, 1, 3], [0, 3, 1], + [1, 0, 3], [1, 3, 0], + [3, 0, 1], [3, 1, 0], + ]) + def test_usecols_diff_positional_int_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["A", "C"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.parametrize("usecols", [ + ["B", "D"], ["D", "B"] + ]) + def test_usecols_diff_positional_str_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["B", "D"]] + expected.index = range(len(expected)) + + result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_read_excel_without_slicing(self, ext): + expected = self.get_csv_refdf("test1") + result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str(self, ext): + expected = self.get_csv_refdf("test1")[["C", "D"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols="A,D:E") + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str_invalid(self, ext): + msg = "Invalid column name: E1" + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, ext): + msg = "list indices must be integers.*, not str" + + with pytest.raises(TypeError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], + usecols=["A", "C"]) + + def test_usecols_pass_non_existent_column(self, ext): + msg = ("Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E"]) + + def test_usecols_wrong_type(self, ext): + msg = ("'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable.") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E1", 0]) + def test_excel_stop_iterator(self, ext): parsed = self.get_exceldf('test2', ext, 'Sheet1') @@ -446,63 +505,48 @@ def test_read_excel_blank_with_header(self, ext): actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - # GH 12292 : error when read one empty column from excel file - def test_read_one_empty_col_no_header(self, ext): + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] 
* 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) + with ensure_clean(ext) as path: - df.to_excel(path, 'no_header', index=False, header=False) - actual_header_none = read_excel( - path, - 'no_header', - usecols=[0], - header=None - ) - - actual_header_zero = read_excel( - path, - 'no_header', - usecols=[0], - header=0 - ) - expected = DataFrame() - tm.assert_frame_equal(actual_header_none, expected) - tm.assert_frame_equal(actual_header_zero, expected) + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_read_one_empty_col_with_header(self, ext): + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) + with ensure_clean(ext) as path: df.to_excel(path, 'with_header', index=False, header=True) - actual_header_none = read_excel( - path, - 'with_header', - usecols=[0], - header=None - ) - - actual_header_zero = read_excel( - path, - 'with_header', - usecols=[0], - header=0 - ) - expected_header_none = DataFrame(pd.Series([0], dtype='int64')) - tm.assert_frame_equal(actual_header_none, expected_header_none) - expected_header_zero = DataFrame(columns=[0]) - tm.assert_frame_equal(actual_header_zero, expected_header_zero) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) @td.skip_if_no('openpyxl') @td.skip_if_no('xlwt') @@ -539,29 +583,33 @@ def test_date_conversion_overflow(self, ext): result = self.get_exceldf('testdateoverflow', ext) tm.assert_frame_equal(result, expected) - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_sheet_name_and_sheetname(self, ext): - # GH10559: Minor improvement: Change "sheet_name" to "sheetname" - # GH10969: DOC: Consistent var names (sheetname vs sheet_name) - # GH12604: CLN GH10559 Rename sheetname variable to sheet_name - # GH20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument - dfref = self.get_csv_refdf('test1') - df1 = self.get_exceldf('test1', ext, - sheet_name='Sheet1') # doc + # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" + # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) + # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name + # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different + # behavior for "sheetname" argument + filename = "test1" + sheet_name = "Sheet1" + + df_ref = self.get_csv_refdf(filename) + df1 = self.get_exceldf(filename, ext, + sheet_name=sheet_name, index_col=0) # doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2 = self.get_exceldf('test1', ext, - sheetname='Sheet1') # bkwrd compat + df2 = self.get_exceldf(filename, ext, index_col=0, + sheetname=sheet_name) # backward compat - excel = self.get_excelfile('test1', ext) - df1_parse = excel.parse(sheet_name='Sheet1') # doc + excel = self.get_excelfile(filename, ext) + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # 
doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(sheetname='Sheet1') # bkwrd compat + df2_parse = excel.parse(index_col=0, + sheetname=sheet_name) # backward compat - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df1_parse, dfref, check_names=False) - tm.assert_frame_equal(df2_parse, dfref, check_names=False) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_sheet_name_both_raises(self, ext): with pytest.raises(TypeError, match="Cannot specify both"): @@ -594,20 +642,24 @@ def test_excel_read_buffer(self, ext): actual = read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - @td.skip_if_no('xlwt') - def test_read_xlrd_Book(self, ext): + @td.skip_if_no("xlwt") + def test_read_xlrd_book(self, ext): import xlrd - df = self.frame - with ensure_clean('.xls') as pth: - df.to_excel(pth, "SheetA") + + engine = "xlrd" + sheet_name = "SheetA" + + with ensure_clean(ext) as pth: + df.to_excel(pth, sheet_name) book = xlrd.open_workbook(pth) - with ExcelFile(book, engine="xlrd") as xl: - result = read_excel(xl, "SheetA") + with ExcelFile(book, engine=engine) as xl: + result = read_excel(xl, sheet_name, index_col=0) tm.assert_frame_equal(df, result) - result = read_excel(book, sheet_name="SheetA", engine="xlrd") + result = read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) tm.assert_frame_equal(df, result) @tm.network @@ -618,17 +670,18 @@ def test_read_from_http_url(self, ext): local_table = self.get_exceldf('test1', ext) tm.assert_frame_equal(url_table, local_table) - @td.skip_if_no('s3fs') + @td.skip_if_no("s3fs") @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext): - boto3 = pytest.importorskip('boto3') - moto = pytest.importorskip('moto') + moto = pytest.importorskip("moto") + boto3 = pytest.importorskip("boto3") with moto.mock_s3(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="pandas-test") file_name = os.path.join(self.dirpath, 'test1' + ext) - with open(file_name, 'rb') as f: + + with open(file_name, "rb") as f: conn.Bucket("pandas-test").put_object(Key="test1" + ext, Body=f) @@ -695,17 +748,18 @@ def test_reader_closes_file(self, ext): assert f.closed - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") def test_creating_and_reading_multiple_sheets(self, ext): - # Test reading multiple sheets, from a runtime created excel file - # with multiple sheets. - # See PR #9450 - def tdf(sheetname): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. 
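The moto-based S3 test above works because moto patches boto3 in-process, letting read_excel exercise its s3:// code path with no network access. A minimal standalone sketch, assuming moto, boto3, s3fs, and an Excel reader engine are installed and a local test1.xlsx exists (bucket, key, and file names hypothetical):

    import boto3
    import moto
    import pandas as pd

    with moto.mock_s3():
        # Fake an S3 bucket and upload a local workbook into it.
        conn = boto3.resource("s3", region_name="us-east-1")
        conn.create_bucket(Bucket="pandas-test")
        with open("test1.xlsx", "rb") as f:
            conn.Bucket("pandas-test").put_object(Key="test1.xlsx", Body=f)

        # s3fs resolves the s3:// URL inside read_excel.
        df = pd.read_excel("s3://pandas-test/test1.xlsx")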
+ def tdf(col_sheet_name): d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[sheetname]) + return DataFrame(d, i, columns=[col_sheet_name]) - sheets = ['AAA', 'BBB', 'CCC'] + sheets = ["AAA", "BBB", "CCC"] dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) @@ -714,7 +768,9 @@ def tdf(sheetname): with ExcelWriter(pth) as ew: for sheetname, df in iteritems(dfs): df.to_excel(ew, sheetname) - dfs_returned = read_excel(pth, sheet_name=sheets) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) @@ -756,206 +812,206 @@ def test_reader_seconds(self, ext): tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, ext): - # GH 4679 - mi = MultiIndex.from_product([['foo', 'bar'], ['a', 'b']]) - mi_file = os.path.join(self.dirpath, 'testmultiindex' + ext) - - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], + # see gh-4679 + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + + # "mi_column" sheet + expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True]], columns=mi) - actual = read_excel(mi_file, 'mi_column', header=[0, 1]) - tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'mi_column', header=[0, 1], index_col=0) + actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - expected.columns = ['a', 'b', 'c', 'd'] + # "mi_index" sheet expected.index = mi - actual = read_excel(mi_file, 'mi_index', index_col=[0, 1]) + expected.columns = ["a", "b", "c", "d"] + + actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) + # "both" sheet expected.columns = mi - actual = read_excel(mi_file, 'both', index_col=[0, 1], header=[0, 1]) + + actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - expected.index = mi.set_names(['ilvl1', 'ilvl2']) - expected.columns = ['a', 'b', 'c', 'd'] - actual = read_excel(mi_file, 'mi_index_name', index_col=[0, 1]) + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) + # "mi_column_name" sheet expected.index = list(range(4)) - expected.columns = mi.set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'mi_column_name', + expected.columns = mi.set_names(["c1", "c2"]) + actual = read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - # Issue #11317 + # see gh-11317 + # "name_with_int" sheet expected.columns = mi.set_levels( - [1, 2], level=1).set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'name_with_int', + [1, 2], level=1).set_names(["c1", "c2"]) + + actual = read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) - expected.columns = mi.set_names(['c1', 'c2']) - expected.index = mi.set_names(['ilvl1', 'ilvl2']) - actual = read_excel(mi_file, 'both_name', - index_col=[0, 1], header=[0, 1]) - 
tm.assert_frame_equal(actual, expected) + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = read_excel(mi_file, 'both_name', + actual = read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0, 1], + # "both_skiprows" sheet + actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) - @td.skip_if_no('xlsxwriter') + @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): - # GH 12453 - with ensure_clean('.xlsx') as path: + # see gh-12453 + with ensure_clean(ext) as path: df = DataFrame({ - ('One', 'x'): {0: 1}, - ('Two', 'X'): {0: 3}, - ('Two', 'Y'): {0: 7}, - ('Zero', ''): {0: 0} + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} }) expected = DataFrame({ - ('One', u'x'): {0: 1}, - ('Two', u'X'): {0: 3}, - ('Two', u'Y'): {0: 7}, - ('Zero', 'Unnamed: 3_level_1'): {0: 0} + ("One", u"x"): {0: 1}, + ("Two", u"X"): {0: 3}, + ("Two", u"Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} }) df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1]) + actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) df = pd.DataFrame({ - ('Beg', ''): {0: 0}, - ('Middle', 'x'): {0: 1}, - ('Tail', 'X'): {0: 3}, - ('Tail', 'Y'): {0: 7} + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} }) expected = pd.DataFrame({ - ('Beg', 'Unnamed: 0_level_1'): {0: 0}, - ('Middle', u'x'): {0: 1}, - ('Tail', u'X'): {0: 3}, - ('Tail', u'Y'): {0: 7} + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", u"x"): {0: 1}, + ("Tail", u"X"): {0: 3}, + ("Tail", u"Y"): {0: 7} }) df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1]) + actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - @td.skip_if_no('xlsxwriter') - def test_excel_multindex_roundtrip(self, ext): - # GH 4679 - with ensure_clean('.xlsx') as pth: - for c_idx_names in [True, False]: - for r_idx_names in [True, False]: - for c_idx_levels in [1, 3]: - for r_idx_levels in [1, 3]: - # column index name can't be serialized unless - # MultiIndex - if (c_idx_levels == 1 and c_idx_names): - continue - - # empty name case current read in as unnamed - # levels, not Nones - check_names = True - if not r_idx_names and r_idx_levels > 1: - check_names = False - - df = mkdf(5, 5, c_idx_names, - r_idx_names, c_idx_levels, - r_idx_levels) - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. 
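The sheet-by-sheet assertions above all reduce to one reading pattern: each block of header rows and each block of leading columns must be named explicitly so they come back as a MultiIndex. A condensed sketch, assuming a workbook shaped like the test fixture (file and sheet names hypothetical):

    import pandas as pd

    # Two header rows become MultiIndex columns; the first two columns
    # become a MultiIndex index.
    actual = pd.read_excel("testmultiindex.xlsx", "both",
                           header=[0, 1], index_col=[0, 1])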
+ check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) - df.iloc[0, :] = np.nan - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) def test_excel_old_index_format(self, ext): # see gh-4679 - filename = 'test_index_name_pre17' + ext + filename = "test_index_name_pre17" + ext in_file = os.path.join(self.dirpath, filename) # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. data = np.array([[None, None, None, None, None], - ['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], - ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], - ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], - ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], - ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) - columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] - mi = MultiIndex(levels=[['R0', 'R_l0_g0', 'R_l0_g1', - 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], - ['R1', 'R_l1_g0', 'R_l1_g1', - 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", + "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", + "R_l1_g2", "R_l1_g3", "R_l1_g4"]], labels=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], names=[None, None]) - si = Index(['R0', 'R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name=None) + si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, 'single_names') + actual = pd.read_excel(in_file, "single_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, 'multi_names') + actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # The analogous versions of the "names" version data # where there are explicitly no names for the indices. 
- data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], - ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], - ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], - ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], - ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) - columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] - mi = MultiIndex(levels=[['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], - ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', - 'R_l1_g3', 'R_l1_g4']], + data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", + "R_l1_g3", "R_l1_g4"]], labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=[None, None]) - si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name=None) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, 'single_no_names') + actual = pd.read_excel(in_file, "single_no_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) + actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) def test_read_excel_bool_header_arg(self, ext): @@ -971,33 +1027,28 @@ def test_read_excel_chunksize(self, ext): pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") def test_read_excel_parse_dates(self, ext): - # GH 11544, 12051 + # see gh-11544, gh-12051 df = DataFrame( - {'col': [1, 2, 3], - 'date_strings': pd.date_range('2012-01-01', periods=3)}) + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) df2 = df.copy() - df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y') + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") with ensure_clean(ext) as pth: df2.to_excel(pth) - res = read_excel(pth) + res = read_excel(pth, index_col=0) tm.assert_frame_equal(df2, res) - # no index_col specified when parse_dates is True - with tm.assert_produces_warning(): - res = read_excel(pth, parse_dates=True) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=['date_strings'], index_col=0) + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y') - res = read_excel(pth, parse_dates=['date_strings'], - date_parser=dateparser, index_col=0) + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) tm.assert_frame_equal(df, res) def test_read_excel_skiprows_list(self, ext): @@ -1106,26 +1157,29 @@ class and any subclasses, on account of the `autouse=True` class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. 
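One change threaded through this whole file is the explicit index_col=0 on read_excel calls: to_excel always writes the frame's index as the leading column, so a round trip only compares equal when the reader is told to treat that column as the index again. A minimal sketch of the pattern, assuming an Excel engine such as xlwt/openpyxl is available (file and sheet names hypothetical):

    import pandas as pd
    import pandas.util.testing as tm

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    df.to_excel("roundtrip.xlsx", "Sheet1")

    # Without index_col=0 the written index would come back as an extra
    # "Unnamed: 0" data column and the comparison would fail.
    result = pd.read_excel("roundtrip.xlsx", "Sheet1", index_col=0)
    tm.assert_frame_equal(result, df)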
- def test_excel_sheet_by_name_raise(self, merge_cells, engine, ext): + def test_excel_sheet_by_name_raise(self, *_): import xlrd gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(self.path) + xl = ExcelFile(self.path) - df = read_excel(xl, 0) + df = read_excel(xl, 0, index_col=0) + tm.assert_frame_equal(gt, df) with pytest.raises(xlrd.XLRDError): - read_excel(xl, '0') + read_excel(xl, "0") - def test_excelwriter_contextmanager(self, merge_cells, engine, ext): + def test_excel_writer_context_manager(self, *_): with ExcelWriter(self.path) as writer: - self.frame.to_excel(writer, 'Data1') - self.frame2.to_excel(writer, 'Data2') + self.frame.to_excel(writer, "Data1") + self.frame2.to_excel(writer, "Data2") with ExcelFile(self.path) as reader: - found_df = read_excel(reader, 'Data1') - found_df2 = read_excel(reader, 'Data2') + found_df = read_excel(reader, "Data1", index_col=0) + found_df2 = read_excel(reader, "Data2", index_col=0) + tm.assert_frame_equal(found_df, self.frame) tm.assert_frame_equal(found_df2, self.frame2) @@ -1182,12 +1236,13 @@ def test_mixed(self, merge_cells, engine, ext): recons = read_excel(reader, 'test1', index_col=0) tm.assert_frame_equal(self.mixed_frame, recons) - def test_tsframe(self, merge_cells, engine, ext): + def test_ts_frame(self, *_): df = tm.makeTimeDataFrame()[:5] - df.to_excel(self.path, 'test1') + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) def test_basics_with_nan(self, merge_cells, engine, ext): @@ -1200,21 +1255,25 @@ def test_basics_with_nan(self, merge_cells, engine, ext): @pytest.mark.parametrize("np_type", [ np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, merge_cells, engine, ext, np_type): - # Test np.int values read come back as int (rather than float - # which is Excel's format). + # Test np.int values read come back as int + # (rather than float which is Excel's format). frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + int_frame = frame.astype(np.int64) tm.assert_frame_equal(int_frame, recons) - recons2 = read_excel(self.path, 'test1') + + recons2 = read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) - # test with convert_float=False comes back as float + # Test with convert_float=False comes back as float. float_frame = frame.astype(float) - recons = read_excel(self.path, 'test1', convert_float=False) + recons = read_excel(self.path, "test1", + convert_float=False, index_col=0) tm.assert_frame_equal(recons, float_frame, check_index_type=False, check_column_type=False) @@ -1224,25 +1283,31 @@ def test_int_types(self, merge_cells, engine, ext, np_type): def test_float_types(self, merge_cells, engine, ext, np_type): # Test np.float values read come back as float. 
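The renamed context-manager test above relies on ExcelWriter saving the workbook on __exit__ and ExcelFile closing its handle the same way; a sketch of the pattern in isolation (file and sheet names hypothetical):

    import pandas as pd

    frame1 = pd.DataFrame({"A": [1, 2]})
    frame2 = pd.DataFrame({"B": [3, 4]})

    # The writer flushes both sheets to disk when the block exits.
    with pd.ExcelWriter("two_sheets.xlsx") as writer:
        frame1.to_excel(writer, "Data1")
        frame2.to_excel(writer, "Data2")

    with pd.ExcelFile("two_sheets.xlsx") as reader:
        back1 = pd.read_excel(reader, "Data1", index_col=0)
        back2 = pd.read_excel(reader, "Data2", index_col=0)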
frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1').astype(np_type) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + tm.assert_frame_equal(frame, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, merge_cells, engine, ext, np_type): # Test np.bool values read come back as float. frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1').astype(np_type) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + tm.assert_frame_equal(frame, recons) - def test_inf_roundtrip(self, merge_cells, engine, ext): + def test_inf_roundtrip(self, *_): frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(frame, recons) def test_sheets(self, merge_cells, engine, ext): @@ -1353,37 +1418,41 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): tm.assert_frame_equal(result, df) assert result.index.name == 'foo' - def test_excel_roundtrip_datetime(self, merge_cells, engine, ext): + def test_excel_roundtrip_datetime(self, merge_cells, *_): # datetime.date, not sure what to test here exactly tsf = self.tsframe.copy() tsf.index = [x.date() for x in self.tsframe.index] - tsf.to_excel(self.path, 'test1', merge_cells=merge_cells) + tsf.to_excel(self.path, "test1", merge_cells=merge_cells) + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(self.tsframe, recons) - # GH4133 - excel output format strings def test_excel_date_datetime_format(self, merge_cells, engine, ext): + # see gh-4133 + # + # Excel output format strings df = DataFrame([[date(2014, 1, 31), date(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)]], - index=['DATE', 'DATETIME'], columns=['X', 'Y']) + index=["DATE", "DATETIME"], columns=["X", "Y"]) df_expected = DataFrame([[datetime(2014, 1, 31), datetime(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)]], - index=['DATE', 'DATETIME'], columns=['X', 'Y']) + index=["DATE", "DATETIME"], columns=["X", "Y"]) with ensure_clean(ext) as filename2: writer1 = ExcelWriter(self.path) writer2 = ExcelWriter(filename2, - date_format='DD.MM.YYYY', - datetime_format='DD.MM.YYYY HH-MM-SS') + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS") - df.to_excel(writer1, 'test1') - df.to_excel(writer2, 'test1') + df.to_excel(writer1, "test1") + df.to_excel(writer2, "test1") writer1.close() writer2.close() @@ -1391,54 +1460,66 @@ def test_excel_date_datetime_format(self, merge_cells, engine, ext): reader1 = ExcelFile(self.path) reader2 = ExcelFile(filename2) - rs1 = read_excel(reader1, 'test1', index_col=None) - rs2 = read_excel(reader2, 'test1', index_col=None) + rs1 = read_excel(reader1, "test1", index_col=0) + rs2 = read_excel(reader2, "test1", index_col=0) tm.assert_frame_equal(rs1, rs2) - # since the reader returns a datetime object for dates, we need - # to use df_expected to check the result + # Since the reader returns a 
datetime object for dates, + # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, merge_cells, engine, ext): - # GH19242 - test writing Interval without labels + def test_to_excel_interval_no_labels(self, *_): + # see gh-19242 + # + # Test writing Interval without labels. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = frame.copy() - frame['new'] = pd.cut(frame[0], 10) - expected['new'] = pd.cut(expected[0], 10).astype(str) - frame.to_excel(self.path, 'test1') + + frame["new"] = pd.cut(frame[0], 10) + expected["new"] = pd.cut(expected[0], 10).astype(str) + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_interval_labels(self, merge_cells, engine, ext): - # GH19242 - test writing Interval with labels + def test_to_excel_interval_labels(self, *_): + # see gh-19242 + # + # Test writing Interval with labels. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=['A', 'B', 'C', 'D', 'E', - 'F', 'G', 'H', 'I', 'J']) - frame['new'] = intervals - expected['new'] = pd.Series(list(intervals)) - frame.to_excel(self.path, 'test1') + intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", + "F", "G", "H", "I", "J"]) + frame["new"] = intervals + expected["new"] = pd.Series(list(intervals)) + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_timedelta(self, merge_cells, engine, ext): - # GH 19242, GH9155 - test writing timedelta to xls + def test_to_excel_timedelta(self, *_): + # see gh-19242, gh-9155 + # + # Test writing timedelta to xls. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=['A'], - dtype=np.int64 - ) + columns=["A"], dtype=np.int64) expected = frame.copy() - frame['new'] = frame['A'].apply(lambda x: timedelta(seconds=x)) - expected['new'] = expected['A'].apply( + + frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) + expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - frame.to_excel(self.path, 'test1') + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, merge_cells, engine, ext): @@ -1543,53 +1624,54 @@ def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, # Test that it is the same as the initial frame. 
tm.assert_frame_equal(frame1, frame3) - def test_to_excel_float_format(self, merge_cells, engine, ext): + def test_to_excel_float_format(self, *_): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - df.to_excel(self.path, 'test1', float_format='%.2f') + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(self.path, "test1", float_format="%.2f") reader = ExcelFile(self.path) - rs = read_excel(reader, 'test1', index_col=None) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - tm.assert_frame_equal(rs, xp) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, merge_cells, engine, ext): - # avoid mixed inferred_type - df = DataFrame([[u'\u0192', u'\u0193', u'\u0194'], - [u'\u0195', u'\u0196', u'\u0197']], - index=[u'A\u0192', u'B'], - columns=[u'X\u0193', u'Y', u'Z']) - - with ensure_clean('__tmp_to_excel_float_format__.' + ext) as filename: - df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') - result = read_excel(filename, 'TestSheet', encoding='utf8') + # Avoid mixed inferred_type. + df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"], + [u"\u0195", u"\u0196", u"\u0197"]], + index=[u"A\u0192", u"B"], + columns=[u"X\u0193", u"Y", u"Z"]) + + with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: + df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") + result = read_excel(filename, "TestSheet", + encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, merge_cells, engine, ext): - with ensure_clean(u('\u0192u.') + ext) as filename: + with ensure_clean(u("\u0192u.") + ext) as filename: try: - f = open(filename, 'wb') + f = open(filename, "wb") except UnicodeEncodeError: - pytest.skip('no unicode file names on this system') + pytest.skip("No unicode file names on this system") else: f.close() df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - df.to_excel(filename, 'test1', float_format='%.2f') + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) - rs = read_excel(reader, 'test1', index_col=None) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - tm.assert_frame_equal(rs, xp) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext): @@ -1691,106 +1773,83 @@ def test_to_excel_unicode_filename(self, merge_cells, engine, ext): # assert ws.cell(maddr).merged # os.remove(filename) - def test_excel_010_hemstring(self, merge_cells, engine, ext): - if merge_cells: - pytest.skip('Skip tests for merged MI format.') + @pytest.mark.parametrize("use_headers", [True, False]) + @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) + @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) + def test_excel_010_hemstring(self, merge_cells, engine, ext, + c_idx_nlevels, r_idx_nlevels, use_headers): - from pandas.util.testing import 
makeCustomDataframe as mkdf - # ensure limited functionality in 0.10 - # override of #2370 until sorted out in 0.11 + def roundtrip(data, header=True, parser_hdr=0, index=True): + data.to_excel(self.path, header=header, + merge_cells=merge_cells, index=index) - def roundtrip(df, header=True, parser_hdr=0, index=True): - - df.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) xf = ExcelFile(self.path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res - - nrows = 5 - ncols = 3 - for use_headers in (True, False): - for i in range(1, 4): # row multindex up to nlevel=3 - for j in range(1, 4): # col "" - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - - # this if will be removed once multi column excel writing - # is implemented for now fixing #9794 - if j > 1: - with pytest.raises(NotImplementedError): - res = roundtrip(df, use_headers, index=False) - else: - res = roundtrip(df, use_headers) - - if use_headers: - assert res.shape == (nrows, ncols + i) - else: - # first row taken as columns - assert res.shape == (nrows - 1, ncols + i) + return read_excel(xf, xf.sheet_names[0], header=parser_hdr) - # no nans - for r in range(len(res.index)): - for c in range(len(res.columns)): - assert res.iloc[r, c] is not np.nan + # Basic test. + parser_header = 0 if use_headers else None + res = roundtrip(DataFrame([0]), use_headers, parser_header) - res = roundtrip(DataFrame([0])) - assert res.shape == (1, 1) - assert res.iloc[0, 0] is not np.nan - - res = roundtrip(DataFrame([0]), False, None) assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan - def test_excel_010_hemstring_raises_NotImplementedError(self, merge_cells, - engine, ext): - # This test was failing only for j>1 and header=False, - # So I reproduced a simple test. - if merge_cells: - pytest.skip('Skip tests for merged MI format.') + # More complex tests with multi-index. + nrows = 5 + ncols = 3 from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 - # override of #2370 until sorted out in 0.11 + # override of gh-2370 until sorted out in 0.11 - def roundtrip2(df, header=True, parser_hdr=0, index=True): + df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, + c_idx_nlevels=c_idx_nlevels) - df.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) - xf = ExcelFile(self.path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res + # This if will be removed once multi-column Excel writing + # is implemented. For now fixing gh-9794. + if c_idx_nlevels > 1: + with pytest.raises(NotImplementedError): + roundtrip(df, use_headers, index=False) + else: + res = roundtrip(df, use_headers) - nrows = 5 - ncols = 3 - j = 2 - i = 1 - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - with pytest.raises(NotImplementedError): - roundtrip2(df, header=False, index=False) + if use_headers: + assert res.shape == (nrows, ncols + r_idx_nlevels) + else: + # First row taken as columns. + assert res.shape == (nrows - 1, ncols + r_idx_nlevels) + + # No NaNs. 
+ for r in range(len(res.index)): + for c in range(len(res.columns)): + assert res.iloc[r, c] is not np.nan - def test_duplicated_columns(self, merge_cells, engine, ext): - # Test for issue #5235 + def test_duplicated_columns(self, *_): + # see gh-5235 write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) - colnames = ['A', 'B', 'B'] + col_names = ["A", "B", "B"] - write_frame.columns = colnames - write_frame.to_excel(self.path, 'test1') + write_frame.columns = col_names + write_frame.to_excel(self.path, "test1") + + read_frame = read_excel(self.path, "test1", index_col=0) + read_frame.columns = col_names - read_frame = read_excel(self.path, 'test1') - read_frame.columns = colnames tm.assert_frame_equal(write_frame, read_frame) - # 11007 / #10970 + # see gh-11007, gh-10970 write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'A', 'B']) - write_frame.to_excel(self.path, 'test1') - read_frame = read_excel(self.path, 'test1') - read_frame.columns = ['A', 'B', 'A', 'B'] + columns=["A", "B", "A", "B"]) + write_frame.to_excel(self.path, "test1") + + read_frame = read_excel(self.path, "test1", index_col=0) + read_frame.columns = ["A", "B", "A", "B"] + tm.assert_frame_equal(write_frame, read_frame) - # 10982 - write_frame.to_excel(self.path, 'test1', index=False, header=False) - read_frame = read_excel(self.path, 'test1', header=None) + # see gh-10982 + write_frame.to_excel(self.path, "test1", index=False, header=False) + read_frame = read_excel(self.path, "test1", header=None) + write_frame.columns = [0, 1, 2, 3] tm.assert_frame_equal(write_frame, read_frame) @@ -1805,36 +1864,40 @@ def test_swapped_columns(self, merge_cells, engine, ext): tm.assert_series_equal(write_frame['A'], read_frame['A']) tm.assert_series_equal(write_frame['B'], read_frame['B']) - def test_invalid_columns(self, merge_cells, engine, ext): - # 10982 - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) + def test_invalid_columns(self, *_): + # see gh-10982 + write_frame = DataFrame({"A": [1, 1, 1], + "B": [2, 2, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - write_frame.to_excel(self.path, 'test1', columns=['B', 'C']) - expected = write_frame.reindex(columns=['B', 'C']) - read_frame = read_excel(self.path, 'test1') + write_frame.to_excel(self.path, "test1", columns=["B", "C"]) + + expected = write_frame.reindex(columns=["B", "C"]) + read_frame = read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(expected, read_frame) with pytest.raises(KeyError): - write_frame.to_excel(self.path, 'test1', columns=['C', 'D']) + write_frame.to_excel(self.path, "test1", columns=["C", "D"]) - def test_comment_arg(self, merge_cells, engine, ext): - # Re issue #18735 - # Test the comment argument functionality to read_excel + def test_comment_arg(self, *_): + # see gh-18735 + # + # Test the comment argument functionality to read_excel. - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") + + # Read file without comment arg. 
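The test_excel_010_hemstring rewrite above replaces three hand-rolled nested loops with stacked parametrize decorators; the same pattern in a self-contained sketch (test name and body hypothetical):

    import pytest

    @pytest.mark.parametrize("use_headers", [True, False])
    @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3])
    @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3])
    def test_combinations(use_headers, r_idx_nlevels, c_idx_nlevels):
        # pytest expands the stack into 2 * 3 * 3 = 18 independent test
        # cases, so a failing combination is reported by its id instead
        # of aborting one monolithic loop at the first bad iteration.
        assert r_idx_nlevels >= 1 and c_idx_nlevels >= 1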
+ result1 = read_excel(self.path, "test_c", index_col=0) - # Read file without comment arg - result1 = read_excel(self.path, 'test_c') result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None - result2 = read_excel(self.path, 'test_c', comment='#') + + result2 = read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) def test_comment_default(self, merge_cells, engine, ext): @@ -1851,22 +1914,23 @@ def test_comment_default(self, merge_cells, engine, ext): result2 = read_excel(self.path, 'test_c', comment=None) tm.assert_frame_equal(result1, result2) - def test_comment_used(self, merge_cells, engine, ext): - # Re issue #18735 - # Test the comment argument is working as expected when used + def test_comment_used(self, *_): + # see gh-18735 + # + # Test the comment argument is working as expected when used. - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") - # Test read_frame_comment against manually produced expected output - expected = DataFrame({'A': ['one', None, 'one'], - 'B': ['two', None, None]}) - result = read_excel(self.path, 'test_c', comment='#') + # Test read_frame_comment against manually produced expected output. + expected = DataFrame({"A": ["one", None, "one"], + "B": ["two", None, None]}) + result = read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) - def test_comment_emptyline(self, merge_cells, engine, ext): + def test_comment_empty_line(self, merge_cells, engine, ext): # Re issue #18735 # Test that read_excel ignores commented lines at the end of file @@ -1899,64 +1963,69 @@ def test_datetimes(self, merge_cells, engine, ext): tm.assert_series_equal(write_frame['A'], read_frame['A']) - # GH7074 def test_bytes_io(self, merge_cells, engine, ext): + # see gh-7074 bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) - # pass engine explicitly as there is no file path to infer from + + # Pass engine explicitly, as there is no file path to infer from. writer = ExcelWriter(bio, engine=engine) df.to_excel(writer) writer.save() + bio.seek(0) - reread_df = read_excel(bio) + reread_df = read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) - # GH8188 - def test_write_lists_dict(self, merge_cells, engine, ext): - df = DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], - 'numeric': [1, 2, 3.0], - 'str': ['apple', 'banana', 'cherry']}) + def test_write_lists_dict(self, *_): + # see gh-8188. 
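The comment-handling tests above pin down what read_excel's comment keyword does: each cell is truncated at the comment character, and a cell left empty by that truncation reads back as missing. A condensed sketch (file and sheet names hypothetical):

    import pandas as pd

    df = pd.DataFrame({"A": ["one", "#one", "one"],
                       "B": ["two", "two", "#two"]})
    df.to_excel("commented.xlsx", "test_c")

    result = pd.read_excel("commented.xlsx", "test_c",
                           comment="#", index_col=0)
    # result["A"] reads back as ["one", NaN, "one"] and
    # result["B"] as ["two", NaN, NaN].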
+ df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"]}) + df.to_excel(self.path, "Sheet1") + read = read_excel(self.path, "Sheet1", header=0, index_col=0) + expected = df.copy() expected.mixed = expected.mixed.apply(str) - expected.numeric = expected.numeric.astype('int64') + expected.numeric = expected.numeric.astype("int64") - df.to_excel(self.path, 'Sheet1') - read = read_excel(self.path, 'Sheet1', header=0) tm.assert_frame_equal(read, expected) - # GH13347 - def test_true_and_false_value_options(self, merge_cells, engine, ext): - df = pd.DataFrame([['foo', 'bar']], columns=['col1', 'col2']) - expected = df.replace({'foo': True, - 'bar': False}) + def test_true_and_false_value_options(self, *_): + # see gh-13347 + df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) + expected = df.replace({"foo": True, "bar": False}) df.to_excel(self.path) - read_frame = read_excel(self.path, true_values=['foo'], - false_values=['bar']) + read_frame = read_excel(self.path, true_values=["foo"], + false_values=["bar"], index_col=0) tm.assert_frame_equal(read_frame, expected) - def test_freeze_panes(self, merge_cells, engine, ext): - # GH15160 - expected = DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2']) + def test_freeze_panes(self, *_): + # see gh-15160 + expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) - result = read_excel(self.path) - tm.assert_frame_equal(expected, result) - def test_path_pathlib(self, merge_cells, engine, ext): + result = read_excel(self.path, index_col=0) + tm.assert_frame_equal(result, expected) + + def test_path_path_lib(self, merge_cells, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) - reader = partial(pd.read_excel) + + reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(ext)) - tm.assert_frame_equal(df, result) + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) - def test_path_localpath(self, merge_cells, engine, ext): + def test_path_local_path(self, merge_cells, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) - reader = partial(pd.read_excel) + + reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(ext)) - tm.assert_frame_equal(df, result) + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) @td.skip_if_no('openpyxl') diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 65709b0eebaf7..79fa49b564ad6 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -506,6 +506,9 @@ def test_td_rfloordiv_numeric_series(self): # TODO: GH-19761. Change to TypeError. 
ser // td + # ---------------------------------------------------------------- + # Timedelta.__mod__, __rmod__ + def test_mod_timedeltalike(self): # GH#19365 td = Timedelta(hours=37) @@ -545,9 +548,6 @@ def test_mod_offset(self): assert isinstance(result, Timedelta) assert result == Timedelta(hours=2) - # ---------------------------------------------------------------- - # Timedelta.__mod__, __rmod__ - def test_mod_numeric(self): # GH#19365 td = Timedelta(hours=37) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 77e43a346c824..4cce26d135443 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -5,7 +5,6 @@ import operator import numpy as np -from numpy import nan import pytest import pandas.compat as compat @@ -750,12 +749,12 @@ def _check_fill(meth, op, a, b, fill_value=0): with np.errstate(all='ignore'): if amask[i]: if bmask[i]: - exp_values.append(nan) + exp_values.append(np.nan) continue exp_values.append(op(fill_value, b[i])) elif bmask[i]: if amask[i]: - exp_values.append(nan) + exp_values.append(np.nan) continue exp_values.append(op(a[i], fill_value)) else: @@ -765,8 +764,8 @@ def _check_fill(meth, op, a, b, fill_value=0): expected = Series(exp_values, exp_index) assert_series_equal(result, expected) - a = Series([nan, 1., 2., 3., nan], index=np.arange(5)) - b = Series([nan, 1, nan, 3, nan, 4.], index=np.arange(6)) + a = Series([np.nan, 1., 2., 3., np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.], index=np.arange(6)) result = op(a, b) exp = equiv_op(a, b) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 9c7dbd85edcbb..fd5dbcd932993 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -160,11 +160,6 @@ def test_construct_DataFrame_with_sp_series(self): df.dtypes str(df) - tm.assert_sp_series_equal(df['col'], self.bseries, check_names=False) - - result = df.iloc[:, 0] - tm.assert_sp_series_equal(result, self.bseries, check_names=False) - # blocking expected = Series({'col': 'float64:sparse'}) result = df.ftypes diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index cbd3e0903b713..d68dd65c9841b 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -187,6 +187,19 @@ def testMult2(self): assert self.d + (-5 * self._offset(-10)) == self.d + self._offset(50) assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6) + def test_compare_str(self): + # GH#23524 + # comparing to strings that cannot be cast to DateOffsets should + # not raise for __eq__ or __ne__ + if self._offset is None: + return + off = self._get_offset(self._offset) + + assert not off == "infer" + assert off != "foo" + # Note: inequalities are only implemented for Tick subclasses; + # tests for this are in test_ticks + class TestCommon(Base): # exected value created by Base._get_offset diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 369c0971f1e9a..128010fe6d32c 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -267,3 +267,25 @@ def test_compare_ticks(cls): assert cls(4) > three assert cls(3) == cls(3) assert cls(3) != cls(4) + + +@pytest.mark.parametrize('cls', tick_classes) +def test_compare_ticks_to_strs(cls): + # GH#23524 + off = cls(19) + + # These tests 
+    # interested in "infer" as that comparison is convenient to make in
+    # Datetime/Timedelta Array/Index constructors
+    assert not off == "infer"
+    assert not "foo" == off
+
+    for left, right in [("infer", off), (off, "infer")]:
+        with pytest.raises(TypeError):
+            left < right
+        with pytest.raises(TypeError):
+            left <= right
+        with pytest.raises(TypeError):
+            left > right
+        with pytest.raises(TypeError):
+            left >= right
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index 6fb562e301ac2..25c419e485db1 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -807,7 +807,6 @@ class CustomBusinessDay(_CustomMixin, BusinessDay):
     Parameters
     ----------
     n : int, default 1
-    offset : timedelta, default timedelta(0)
     normalize : bool, default False
         Normalize start/end dates to midnight before generating date range
     weekmask : str, Default 'Mon Tue Wed Thu Fri'
@@ -816,6 +815,7 @@ class CustomBusinessDay(_CustomMixin, BusinessDay):
         list/array of dates to exclude from the set of valid business days,
         passed to ``numpy.busdaycalendar``
     calendar : pd.HolidayCalendar or np.busdaycalendar
+    offset : timedelta, default timedelta(0)
     """
     _prefix = 'C'
     _attributes = frozenset(['n', 'normalize',
@@ -958,7 +958,6 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset):
     Parameters
     ----------
     n : int, default 1
-    offset : timedelta, default timedelta(0)
     normalize : bool, default False
         Normalize start/end dates to midnight before generating date range
     weekmask : str, Default 'Mon Tue Wed Thu Fri'
@@ -967,6 +966,7 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset):
         list/array of dates to exclude from the set of valid business days,
         passed to ``numpy.busdaycalendar``
     calendar : pd.HolidayCalendar or np.busdaycalendar
+    offset : timedelta, default timedelta(0)
     """
     _attributes = frozenset(['n', 'normalize',
                              'weekmask', 'holidays', 'calendar', 'offset'])
@@ -2199,9 +2199,18 @@ def apply_index(self, i):
 
 
 def _tick_comp(op):
+    assert op not in [operator.eq, operator.ne]
+
     def f(self, other):
-        return op(self.delta, other.delta)
+        try:
+            return op(self.delta, other.delta)
+        except AttributeError:
+            # comparing with a non-Tick object
+            raise TypeError("Invalid comparison between {cls} and {typ}"
+                            .format(cls=type(self).__name__,
+                                    typ=type(other).__name__))
+
     f.__name__ = '__{opname}__'.format(opname=op.__name__)
     return f
 
@@ -2220,8 +2229,6 @@ def __init__(self, n=1, normalize=False):
     __ge__ = _tick_comp(operator.ge)
     __lt__ = _tick_comp(operator.lt)
     __le__ = _tick_comp(operator.le)
-    __eq__ = _tick_comp(operator.eq)
-    __ne__ = _tick_comp(operator.ne)
 
     def __add__(self, other):
         if isinstance(other, Tick):
@@ -2242,8 +2249,13 @@ def __add__(self, other):
     def __eq__(self, other):
         if isinstance(other, compat.string_types):
             from pandas.tseries.frequencies import to_offset
-
-            other = to_offset(other)
+            try:
+                # GH#23524 if to_offset fails, we are dealing with an
+                # incomparable type so == is False and != is True
+                other = to_offset(other)
+            except ValueError:
+                # e.g. "infer"
+                return False
 
         if isinstance(other, Tick):
             return self.delta == other.delta
@@ -2258,8 +2270,13 @@ def __hash__(self):
     def __ne__(self, other):
         if isinstance(other, compat.string_types):
             from pandas.tseries.frequencies import to_offset
-
-            other = to_offset(other)
+            try:
+                # GH#23524 if to_offset fails, we are dealing with an
+                # incomparable type so == is False and != is True
+                other = to_offset(other)
+            except ValueError:
+                # e.g. "infer"
"infer" + return True if isinstance(other, Tick): return self.delta != other.delta diff --git a/pandas/util/testing.py b/pandas/util/testing.py index fd7012c87040f..1fa77f5321038 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1242,18 +1242,18 @@ def assert_series_equal(left, right, check_dtype=True, check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare - check_exact : bool, default False - Whether to compare number exactly. + If int, then specify the digits to compare. check_names : bool, default True Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. obj : str, default 'Series' Specify object name being compared, internally used to show appropriate - assertion message + assertion message. """ __tracebackhide__ = True @@ -1587,7 +1587,7 @@ def assert_equal(left, right, **kwargs): raise NotImplementedError(type(left)) -def box_expected(expected, box_cls): +def box_expected(expected, box_cls, transpose=True): """ Helper function to wrap the expected output of a test in a given box_class. @@ -1606,6 +1606,11 @@ def box_expected(expected, box_cls): expected = pd.Series(expected) elif box_cls is pd.DataFrame: expected = pd.Series(expected).to_frame() + if transpose: + # for vector operations, we we need a DataFrame to be a single-row, + # not a single-column, in order to operate against non-DataFrame + # vectors of the same length. 
+            expected = expected.T
     elif box_cls is PeriodArray:
         # the PeriodArray constructor is not as flexible as period_array
         expected = period_array(expected)
diff --git a/ci/requirements-optional-pip.txt b/requirements-dev.txt
similarity index 63%
rename from ci/requirements-optional-pip.txt
rename to requirements-dev.txt
index 62f1c555d8544..93145d948c218 100644
--- a/ci/requirements-optional-pip.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,17 @@
-# This file was autogenerated by scripts/convert_deps.py
-# Do not modify directly
+NumPy
+python-dateutil>=2.5.0
+pytz
+Cython>=0.28.2
+flake8
+flake8-comprehensions
+flake8-rst
+hypothesis>=3.58.0
+isort
+moto
+pytest>=3.6
+setuptools>=24.2.0
+sphinx
+sphinxcontrib-spelling
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
diff --git a/scripts/convert_deps.py b/scripts/convert_deps.py
deleted file mode 100755
index 3ff157e0a0d7b..0000000000000
--- a/scripts/convert_deps.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
-Convert the conda environment.yaml to a pip requirements.txt
-"""
-import re
-import yaml
-
-exclude = {'python=3'}
-rename = {'pytables': 'tables'}
-
-with open("ci/environment-dev.yaml") as f:
-    dev = yaml.load(f)
-
-with open("ci/requirements-optional-conda.txt") as f:
-    optional = [x.strip() for x in f.readlines()]
-
-required = dev['dependencies']
-required = [rename.get(dep, dep) for dep in required if dep not in exclude]
-optional = [rename.get(dep, dep) for dep in optional if dep not in exclude]
-optional = [re.sub("(?<=[^<>])=", '==', dep) for dep in optional]
-
-
-with open("ci/requirements_dev.txt", 'wt') as f:
-    f.write("# This file was autogenerated by scripts/convert_deps.py\n")
-    f.write("# Do not modify directly\n")
-    f.write('\n'.join(required))
-
-
-with open("ci/requirements-optional-pip.txt", 'wt') as f:
-    f.write("# This file was autogenerated by scripts/convert_deps.py\n")
-    f.write("# Do not modify directly\n")
-    f.write("\n".join(optional))
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
new file mode 100755
index 0000000000000..2474214a4a53b
--- /dev/null
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""
+Convert the conda environment.yml to the pip requirements-dev.txt,
+or check that they have the same packages (for the CI).
+
+Usage:
+
+    Generate `requirements-dev.txt`:
+    $ ./scripts/generate_pip_deps_from_conda.py
+
+    Compare and fail (exit status != 0) if `requirements-dev.txt` has not been
+    generated with this script:
+    $ ./scripts/generate_pip_deps_from_conda.py --compare
+"""
+import argparse
+import os
+import re
+import sys
+import yaml
+
+
+EXCLUDE = {'python=3'}
+RENAME = {'pytables': 'tables'}
+
+
+def conda_package_to_pip(package):
+    """
+    Convert a conda package to its pip equivalent.
+
+    In most cases they are the same; these are the exceptions:
+    - Packages that should be excluded (in `EXCLUDE`)
+    - Packages that should be renamed (in `RENAME`)
+    - A package requiring a specific version, which conda pins with a single
+      equals sign (e.g. ``pandas=1.0``) and pip with two (e.g. ``pandas==1.0``)
+    """
+    if package in EXCLUDE:
+        return
+
+    if package in RENAME:
+        return RENAME[package]
+
+    return re.sub('(?<=[^<>])=', '==', package).strip()
+
+
+def main(conda_fname, pip_fname, compare=False):
+    """
+    Generate the pip dependencies file from the conda file, or check that
+    they are synchronized (``compare=True``).
+
+    Parameters
+    ----------
+    conda_fname : str
+        Path to the conda file with dependencies (e.g. `environment.yml`).
+    pip_fname : str
+        Path to the pip file with dependencies (e.g. `requirements-dev.txt`).
+    compare : bool, default False
+        Whether to generate the pip file (``False``) or to check that the
+        pip file was generated with this script from the current version
+        of the conda file (``True``).
+
+    Returns
+    -------
+    bool
+        True if the comparison fails, False otherwise.
+    """
+    with open(conda_fname) as conda_fd:
+        deps = yaml.safe_load(conda_fd)['dependencies']
+
+    pip_content = '\n'.join(filter(None, map(conda_package_to_pip, deps)))
+
+    if compare:
+        with open(pip_fname) as pip_fd:
+            return pip_content != pip_fd.read()
+    else:
+        with open(pip_fname, 'w') as pip_fd:
+            pip_fd.write(pip_content)
+        return False
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        description='convert (or compare) conda file to pip')
+    argparser.add_argument('--compare',
+                           action='store_true',
+                           help='compare whether the two files are equivalent')
+    args = argparser.parse_args()
+
+    repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    res = main(os.path.join(repo_path, 'environment.yml'),
+               os.path.join(repo_path, 'requirements-dev.txt'),
+               compare=args.compare)
+    if res:
+        sys.stderr.write('`requirements-dev.txt` has to be generated with '
+                         '`{}` after `environment.yml` is modified.\n'.format(
+                             sys.argv[0]))
+        sys.exit(res)
diff --git a/setup.cfg b/setup.cfg
index 2e07182196d5b..9f5384170a245 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -120,9 +120,6 @@ skip=
     pandas/core/indexes/numeric.py,
     pandas/core/indexes/interval.py,
     pandas/core/indexes/multi.py,
-    pandas/core/indexes/timedeltas.py,
-    pandas/core/indexes/datetimelike.py,
-    pandas/core/indexes/datetimes.py,
     pandas/core/indexes/base.py,
     pandas/core/indexes/accessors.py,
     pandas/core/indexes/period.py,