diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000000..cd14db03627 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,30 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +#### MCVE Code Sample + +In order for the maintainers to efficiently understand and prioritize issues, we ask that you post a "Minimal, Complete and Verifiable Example" (MCVE): http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports + +```python +# Your code here + +``` + +#### Problem Description + +[This should explain **why** the current behavior is a problem and why the expected output is a better solution.] + +#### Expected Output + +#### Output of ``xr.show_versions()`` + +<details>
+# Paste the output of ``xr.show_versions()`` here + +</details>
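For illustration only (not part of the template), a filled-in MCVE might look like the sketch below; the array and the printed result are placeholders, not a real bug report:

```python
import numpy as np
import xarray as xr

# Smallest self-contained input that still triggers the behavior
da = xr.DataArray(np.arange(4.0), dims='x')

# One line demonstrating the problem, with the expected result noted
print(da.mean())  # expected: 1.5 -- describe the actual (buggy) output in the report

# Environment details to paste under "Output of xr.show_versions()"
xr.show_versions()
```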
diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 018003f2223..8d87864e426 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -1,16 +1,6 @@ -# File : .pep8speaks.yml - -# This should be kept in sync with the duplicate config in the [pycodestyle] -# block of setup.cfg. +# https://github.com/OrkoHunter/pep8speaks for more info +# pep8speaks will use the flake8 configs in `setup.cfg` scanner: - diff_only: False # If True, errors caused by only the patch are shown - -pycodestyle: - max-line-length: 79 - ignore: # Errors and warnings to ignore - - E402 # module level import not at top of file - - E731 # do not assign a lambda expression, use a def - - E741 # ambiguous variable name - - W503 # line break before binary operator - - W504 # line break after binary operator + diff_only: False + linter: flake8 diff --git a/.travis.yml b/.travis.yml index 155c0271b30..efa903f5083 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,6 +22,7 @@ matrix: - env: CONDA_ENV=py36-zarr-dev - env: CONDA_ENV=docs - env: CONDA_ENV=lint + - env: CONDA_ENV=typing - env: CONDA_ENV=py36-hypothesis allow_failures: @@ -30,6 +31,7 @@ matrix: - EXTRA_FLAGS="--run-flaky --run-network-tests" - env: CONDA_ENV=py36-pandas-dev - env: CONDA_ENV=py36-zarr-dev + - env: CONDA_ENV=typing before_install: - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; @@ -40,9 +42,10 @@ before_install: - conda info -a install: - - if [[ "$CONDA_ENV" == "docs" ]]; then + - | + if [[ "$CONDA_ENV" == "docs" ]]; then conda env create -n test_env --file doc/environment.yml; - elif [[ "$CONDA_ENV" == "lint" ]]; then + elif [[ "$CONDA_ENV" == "lint" ]] || [[ "$CONDA_ENV" == "typing" ]] ; then conda env create -n test_env --file ci/requirements-py37.yml; else conda env create -n test_env --file ci/requirements-$CONDA_ENV.yml; @@ -56,11 +59,14 @@ script: - which python - python --version - python -OO -c "import xarray" - - if [[ "$CONDA_ENV" == "docs" ]]; then + - | + if [[ "$CONDA_ENV" == "docs" ]]; then cd doc; sphinx-build -n -j auto -b html -d _build/doctrees . _build/html; elif [[ "$CONDA_ENV" == "lint" ]]; then - pycodestyle xarray ; + flake8 ; + elif [[ "$CONDA_ENV" == "typing" ]]; then + mypy . 
; elif [[ "$CONDA_ENV" == "py36-hypothesis" ]]; then pytest properties ; else diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index 997fdfd0db0..d0eb6282fce 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -1,6 +1,5 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import itertools import numpy as np diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index 03242426a36..aab926ac6aa 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -14,7 +14,7 @@ dependencies: - pytest-cov - pytest-env - coveralls - - pycodestyle + - flake8 - numpy>=1.12 - pandas>=0.19 - scipy @@ -24,12 +24,11 @@ dependencies: - bottleneck - zarr - pseudonetcdf>=3.0.1 - - eccodes + - cfgrib>=0.9.2 - cdms2 - pynio - iris>=1.10 - pydap - lxml - pip: - - cfgrib>=0.9.2 - mypy==0.660 diff --git a/ci/requirements-py37.yml b/ci/requirements-py37.yml index 0cece4ed6dd..723ad24d24d 100644 --- a/ci/requirements-py37.yml +++ b/ci/requirements-py37.yml @@ -15,7 +15,7 @@ dependencies: - pytest-cov - pytest-env - coveralls - - pycodestyle + - flake8 - numpy>=1.12 - pandas>=0.19 - scipy @@ -25,9 +25,9 @@ dependencies: - bottleneck - zarr - pseudonetcdf>=3.0.1 + - cfgrib>=0.9.2 - lxml - - eccodes - pydap - pip: - - cfgrib>=0.9.2 - mypy==0.650 + - numbagg diff --git a/conftest.py b/conftest.py index d7f4e0c89bc..ffceb27e753 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,7 @@ """Configuration for pytest.""" +import pytest + def pytest_addoption(parser): """Add command-line flags for pytest.""" @@ -7,3 +9,21 @@ def pytest_addoption(parser): help="runs flaky tests") parser.addoption("--run-network-tests", action="store_true", help="runs tests requiring a network connection") + + +def pytest_collection_modifyitems(config, items): + + if not config.getoption("--run-flaky"): + skip_flaky = pytest.mark.skip( + reason="set --run-flaky option to run flaky tests") + for item in items: + if "flaky" in item.keywords: + item.add_marker(skip_flaky) + + if not config.getoption("--run-network-tests"): + skip_network = pytest.mark.skip( + reason="set --run-network-tests option to run tests requiring an" + "internet connection") + for item in items: + if "network" in item.keywords: + item.add_marker(skip_network) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 4b2fed8be37..8f82b30a442 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -153,3 +153,4 @@ CFTimeIndex.shift CFTimeIndex.to_datetimeindex + CFTimeIndex.strftime diff --git a/doc/api.rst b/doc/api.rst index 00b33959eed..258d1748c1b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -148,6 +148,7 @@ Computation Dataset.groupby Dataset.groupby_bins Dataset.rolling + Dataset.rolling_exp Dataset.coarsen Dataset.resample Dataset.diff @@ -189,6 +190,7 @@ Computation :py:attr:`~core.groupby.DatasetGroupBy.last` :py:attr:`~core.groupby.DatasetGroupBy.fillna` :py:attr:`~core.groupby.DatasetGroupBy.where` +:py:attr:`~core.groupby.DatasetGroupBy.quantile` Reshaping and reorganizing -------------------------- @@ -315,6 +317,7 @@ Computation DataArray.groupby DataArray.groupby_bins DataArray.rolling + DataArray.rolling_exp DataArray.coarsen DataArray.dt DataArray.resample @@ -324,6 +327,7 @@ Computation DataArray.quantile DataArray.differentiate DataArray.integrate + DataArray.str **Aggregation**: :py:attr:`~DataArray.all` @@ -359,7 +363,7 @@ Computation 
:py:attr:`~core.groupby.DataArrayGroupBy.last` :py:attr:`~core.groupby.DataArrayGroupBy.fillna` :py:attr:`~core.groupby.DataArrayGroupBy.where` - +:py:attr:`~core.groupby.DataArrayGroupBy.quantile` Reshaping and reorganizing -------------------------- @@ -460,6 +464,7 @@ Dataset methods :toctree: generated/ open_dataset + load_dataset open_mfdataset open_rasterio open_zarr @@ -487,6 +492,7 @@ DataArray methods :toctree: generated/ open_dataarray + load_dataarray DataArray.to_dataset DataArray.to_netcdf DataArray.to_pandas @@ -532,6 +538,7 @@ Rolling objects core.rolling.DatasetRolling core.rolling.DatasetRolling.construct core.rolling.DatasetRolling.reduce + core.rolling_exp.RollingExp Resample objects ================ @@ -555,6 +562,15 @@ Resample objects also implement the GroupBy interface core.resample.DatasetResample.nearest core.resample.DatasetResample.pad +Accessors +========= + +.. autosummary:: + :toctree: generated/ + + core.accessor_dt.DatetimeAccessor + core.accessor_str.StringAccessor + Custom Indexes ============== .. autosummary:: diff --git a/doc/computation.rst b/doc/computation.rst index 2d41479f67f..b06d7959504 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -45,6 +45,12 @@ Use :py:func:`~xarray.where` to conditionally switch between values: xr.where(arr > 0, 'positive', 'negative') +Use `@` to perform matrix multiplication: + +.. ipython:: python + + arr @ arr + Data arrays also implement many :py:class:`numpy.ndarray` methods: .. ipython:: python @@ -143,20 +149,35 @@ name of the dimension as a key (e.g. ``y``) and the window size as the value arr.rolling(y=3) -The label position and minimum number of periods in the rolling window are -controlled by the ``center`` and ``min_periods`` arguments: +Aggregation and summary methods can be applied directly to the ``Rolling`` +object: .. ipython:: python - arr.rolling(y=3, min_periods=2, center=True) + r = arr.rolling(y=3) + r.reduce(np.std) + r.mean() -Aggregation and summary methods can be applied directly to the ``Rolling`` object: +Aggregation results are assigned the coordinate at the end of each window by +default, but can be centered by passing ``center=True`` when constructing the +``Rolling`` object: .. ipython:: python - r = arr.rolling(y=3) + r = arr.rolling(y=3, center=True) + r.mean() + +As can be seen above, aggregations of windows which overlap the border of the +array produce ``nan``s. Setting ``min_periods`` in the call to ``rolling`` +changes the minimum number of observations within the window required to have +a value when aggregating: + +.. ipython:: python + + r = arr.rolling(y=3, min_periods=2) + r.mean() + r = arr.rolling(y=3, center=True, min_periods=2) r.mean() - r.reduce(np.std) Note that rolling window aggregations are faster when bottleneck_ is installed. @@ -169,6 +190,22 @@ We can also manually iterate through ``Rolling`` objects: for label, arr_window in r: # arr_window is a view of x +.. _comput.rolling_exp: + +While ``rolling`` provides a simple moving average, ``DataArray`` also supports +an exponential moving average with :py:meth:`~xarray.DataArray.rolling_exp`. +This is similar to pandas' ``ewm`` method. numbagg_ is required. + +.. _numbagg: https://github.com/shoyer/numbagg + +.. code:: python + + arr.rolling_exp(y=3).mean() + +The ``rolling_exp`` method takes a ``window_type`` kwarg, which can be ``'alpha'``, +``'com'`` (for ``center-of-mass``), ``'span'``, and ``'halflife'``. The default is +``span``. 
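As a quick sketch of the ``window_type`` keyword just described (assuming numbagg is installed and ``arr`` is the DataArray from the earlier examples):

```python
# Interpret the window size as a half-life rather than the default span
arr.rolling_exp(y=3, window_type='halflife').mean()
```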
+ Finally, the rolling object has a ``construct`` method which returns a view of the original ``DataArray`` with the windowed dimension in the last position. diff --git a/doc/conf.py b/doc/conf.py index 322741556b6..237669460b2 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -13,11 +13,11 @@ # serve to show the default. from __future__ import absolute_import, division, print_function -from contextlib import suppress import datetime import os import subprocess import sys +from contextlib import suppress import xarray diff --git a/doc/contributing.rst b/doc/contributing.rst index fba09497abe..651c1d47db5 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -351,11 +351,11 @@ the more common ``PEP8`` issues: - passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` :ref:`Continuous Integration ` will run -the `pycodestyle `_ tool +the `flake8 `_ tool and report any stylistic errors in your code. Therefore, it is helpful before -submitting code to run the check yourself:: +submitting code to run the check yourself: - pycodestyle xarray + flake8 Other recommended but optional tools for checking code quality (not currently enforced in CI): @@ -363,8 +363,6 @@ enforced in CI): - `mypy `_ performs static type checking, which can make it easier to catch bugs. Please run ``mypy xarray`` if you annotate any code with `type hints `_. -- `flake8 `_ includes a few more automated - checks than those enforced by pycodestyle. - `isort `_ will highlight incorrectly sorted imports. ``isort -y`` will automatically fix them. See also `flake8-isort `_. diff --git a/doc/examples/_code/weather_data_setup.py b/doc/examples/_code/weather_data_setup.py index 89470542d5a..d3a3e2d065a 100644 --- a/doc/examples/_code/weather_data_setup.py +++ b/doc/examples/_code/weather_data_setup.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -import seaborn as sns # pandas aware plotting library +import seaborn as sns # noqa, pandas aware plotting library import xarray as xr diff --git a/doc/installing.rst b/doc/installing.rst index f624da18611..b9d1b4d0ba4 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -45,6 +45,8 @@ For accelerating xarray - `bottleneck `__: speeds up NaN-skipping and rolling window aggregations by a large factor (1.1 or later) +- `numbagg `_: for exponential rolling + window operations For parallel computing ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/time-series.rst b/doc/time-series.rst index 53efcd45ba2..e198887dd0d 100644 --- a/doc/time-series.rst +++ b/doc/time-series.rst @@ -152,6 +152,15 @@ __ http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases ds['time'].dt.floor('D') +The ``.dt`` accessor can also be used to generate formatted datetime strings +for arrays utilising the same formatting as the standard `datetime.strftime`_. + +.. _datetime.strftime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior + +.. ipython:: python + + ds['time'].dt.strftime('%a, %b %d %H:%M') + .. 
_resampling: Resampling and grouped operations diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index 1950ba62ffb..a17ecd2f2a4 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -71,6 +71,18 @@ instance, we can create the same dates and DataArray we created above using: dates = xr.cftime_range(start='0001', periods=24, freq='MS', calendar='noleap') da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'], name='foo') +With :py:meth:`~xarray.CFTimeIndex.strftime` we can also easily generate formatted strings from +the datetime values of a :py:class:`~xarray.CFTimeIndex` directly or through the +:py:meth:`~xarray.DataArray.dt` accessor for a :py:class:`~xarray.DataArray` +using the same formatting as the standard `datetime.strftime`_ convention. + +.. _datetime.strftime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior + +.. ipython:: python + + dates.strftime('%c') + da['time'].dt.strftime('%Y%m%d') + For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: - `Partial datetime string indexing`_ using strictly `ISO 8601-format`_ partial diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a2ade467eef..a24c0da4445 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,14 +21,57 @@ v0.12.2 (unreleased) Enhancements ~~~~~~~~~~~~ +- New :py:meth:`~xarray.GroupBy.quantile` method. (:issue:`3018`) + By `David Huard `_. +- Add ``keepdims`` argument for reduce operations (:issue:`2170`) + By `Scott Wales `_. +- netCDF chunksizes are now only dropped when original_shape is different, + not when it isn't found. (:issue:`2207`) + By `Karel van de Plassche `_. +- Enable `@` operator for DataArray. This is equivalent to :py:meth:`DataArray.dot`. + By `Maximilian Roos `_. - Add ``fill_value`` argument for reindex, align, and merge operations to enable custom fill values. (:issue:`2876`) By `Zach Griffith `_. +- :py:meth:`~xarray.DataArray.rolling_exp` and + :py:meth:`~xarray.Dataset.rolling_exp` added, similar to pandas' + ``pd.DataFrame.ewm`` method. Calling ``.mean`` on the resulting object + will return an exponentially weighted moving average. + By `Maximilian Roos `_. - Character arrays' character dimension name decoding and encoding handled by ``var.encoding['char_dim_name']`` (:issue:`2895`) By `James McCreight `_. +- :py:meth:`DataArray.transpose` now accepts a keyword argument + ``transpose_coords`` which enables transposition of coordinates in the + same way as :py:meth:`Dataset.transpose`. :py:meth:`DataArray.groupby`, + :py:meth:`DataArray.groupby_bins`, and :py:meth:`DataArray.resample` now + accept a keyword argument ``restore_coord_dims`` which keeps the order + of the dimensions of multi-dimensional coordinates intact (:issue:`1856`). + By `Peter Hausamann `_. - Clean up Python 2 compatibility in code (:issue:`2950`) By `Guido Imperiale `_. +- Implement ``load_dataset()`` and ``load_dataarray()`` as alternatives to + ``open_dataset()`` and ``open_dataarray()`` to open, load into memory, + and close files, returning the Dataset or DataArray. These functions are + helpful for avoiding file-lock errors when trying to write to files opened + using ``open_dataset()`` or ``open_dataarray()``. (:issue:`2887`) + By `Dan Nowacki `_. +- Better warning message when supplying invalid objects to ``xr.merge`` + (:issue:`2948`). By `Mathias Hauser `_. +- Added ``strftime`` method to ``.dt`` accessor, making it simpler to hand a + datetime ``DataArray`` to other code expecting formatted dates and times. 
+ (:issue:`2090`). By `Alan Brammer `_ and + `Ryan May `_. +- Like :py:class:`pandas.DatetimeIndex`, :py:class:`CFTimeIndex` now supports a + :py:meth:`~xarray.CFTimeIndex.strftime` method to return an index of string + formatted datetimes. By `Alan Brammer `_. +- Add ``.str`` accessor to DataArrays for string related manipulations. + By `0x0L `_. +- Add ``errors`` keyword argument to :py:meth:`Dataset.drop` and :py:meth:`Dataset.drop_dims` + that allows ignoring errors if a passed label or dimension is not in the dataset + (:issue:`2994`). + By `Andrew Ross `_. + Bug fixes ~~~~~~~~~ @@ -37,14 +80,27 @@ Bug fixes :py:meth:`xr.open_mfdataset` sets variable encodings to that of variables in first file.(:issue:`2436`, :issue:`2921`) By `Deepak Cherian `_. +- NetCDF4 output: variables with unlimited dimensions must be chunked (not + contiguous) on output. (:issue:`1849`) + By `James McCreight `_. - indexing with an empty list creates an object with zero-length axis (:issue:`2882`) By `Mayeul d'Avezac `_. - Return correct count for scalar datetime64 arrays (:issue:`2770`) By `Dan Nowacki `_. -- Fix facetgrid colormap bug when ``extend=True``. (:issue:`2932`) - By `Deepak Cherian `_. +- Increased support for `missing_value` (:issue:`2871`) + By `Deepak Cherian `_. +- Removed usages of `pytest.config`, which is deprecated (:issue:`2988`) + By `Maximilian Roos `_. +- Fixed performance issues with cftime installed (:issue:`3000`) + By `0x0L `_. +- Replace incorrect usages of `message` in pytest assertions + with `match` (:issue:`3011`) + By `Maximilian Roos `_. +- Add explicit pytest markers, now required by pytest + (:issue:`3032`). + By `Maximilian Roos `_. .. _whats-new.0.12.1: @@ -142,6 +198,7 @@ Other enhancements - Upsampling an array via interpolation with resample is now dask-compatible, as long as the array is not chunked along the resampling dimension. By `Spencer Clark `_. + - :py:func:`xarray.testing.assert_equal` and :py:func:`xarray.testing.assert_identical` now provide a more detailed report showing what exactly differs between the two objects (dimensions / @@ -157,9 +214,9 @@ Other enhancements By `Keisuke Fujii `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. -- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` - parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for - backwards compatibility. The ``overwrite_encoded_chunks`` parameter is +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is added to remove the original zarr chunk encoding. By `Lily Wang `_. @@ -691,7 +748,7 @@ Enhancements arguments in ``data_vars`` to indexes set explicitly in ``coords``, where previously an error would be raised. (:issue:`674`) - By `Maximilian Roos `_. + By `Maximilian Roos `_. - :py:meth:`~DataArray.sel`, :py:meth:`~DataArray.isel` & :py:meth:`~DataArray.reindex`, (and their :py:class:`Dataset` counterparts) now support supplying a ``dict`` @@ -699,12 +756,12 @@ Enhancements of supplying `kwargs`. This allows for more robust behavior of dimension names which conflict with other keyword names, or are not strings. - By `Maximilian Roos `_. + By `Maximilian Roos `_. - :py:meth:`~DataArray.rename` now supports supplying ``**kwargs``, as an alternative to the existing approach of supplying a ``dict`` as the first argument. - By `Maximilian Roos `_. 
+ By `Maximilian Roos `_. - :py:meth:`~DataArray.cumsum` and :py:meth:`~DataArray.cumprod` now support aggregation over multiple dimensions at the same time. This is the default @@ -869,7 +926,7 @@ Enhancements which test each value in the array for whether it is contained in the supplied list, returning a bool array. See :ref:`selecting values with isin` for full details. Similar to the ``np.isin`` function. - By `Maximilian Roos `_. + By `Maximilian Roos `_. - Some speed improvement to construct :py:class:`~xarray.DataArrayRolling` object (:issue:`1993`) By `Keisuke Fujii `_. @@ -2064,7 +2121,7 @@ Enhancements ~~~~~~~~~~~~ - New documentation on :ref:`panel transition`. By - `Maximilian Roos `_. + `Maximilian Roos `_. - New ``Dataset`` and ``DataArray`` methods :py:meth:`~xarray.Dataset.to_dict` and :py:meth:`~xarray.Dataset.from_dict` to allow easy conversion between dictionaries and xarray objects (:issue:`432`). See @@ -2085,9 +2142,9 @@ Bug fixes (:issue:`953`). By `Stephan Hoyer `_. - ``Dataset.__dir__()`` (i.e. the method python calls to get autocomplete options) failed if one of the dataset's keys was not a string (:issue:`852`). - By `Maximilian Roos `_. + By `Maximilian Roos `_. - ``Dataset`` constructor can now take arbitrary objects as values - (:issue:`647`). By `Maximilian Roos `_. + (:issue:`647`). By `Maximilian Roos `_. - Clarified ``copy`` argument for :py:meth:`~xarray.DataArray.reindex` and :py:func:`~xarray.align`, which now consistently always return new xarray objects (:issue:`927`). diff --git a/setup.cfg b/setup.cfg index 18922b1647a..bfa49118d84 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,11 +9,23 @@ filterwarnings = ignore:Using a non-tuple sequence for multidimensional indexing is deprecated:FutureWarning env = UVCDAT_ANONYMOUS_LOG=no +markers = + flaky: flaky tests + network: tests requiring a network connection + slow: slow tests -# This should be kept in sync with .pep8speaks.yml -[pycodestyle] +[flake8] max-line-length=79 -ignore=E402,E731,E741,W503,W504 +ignore= + E402 + E731 + E741 + W503 + W504 + # Unused imports; TODO: Allow typing to work without triggering errors + F401 +exclude= + doc [isort] default_section=THIRDPARTY @@ -21,6 +33,8 @@ known_first_party=xarray multi_line_output=4 # Most of the numerical computing stack doesn't have type annotations yet. 
+[mypy-affine.*] +ignore_missing_imports = True [mypy-bottleneck.*] ignore_missing_imports = True [mypy-cdms2.*] ignore_missing_imports = True @@ -47,6 +61,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-nc_time_axis.*] ignore_missing_imports = True +[mypy-numbagg.*] +ignore_missing_imports = True [mypy-numpy.*] ignore_missing_imports = True [mypy-netCDF4.*] ignore_missing_imports = True @@ -72,6 +88,12 @@ ignore_missing_imports = True [mypy-zarr.*] ignore_missing_imports = True +# setuptools is not typed +[mypy-setup] +ignore_errors = True +# versioneer code +[mypy-versioneer.*] +ignore_errors = True # written by versioneer [mypy-xarray._version] ignore_errors = True diff --git a/xarray/__init__.py b/xarray/__init__.py index 773dfe19d01..9eaa705e108 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -1,3 +1,4 @@ +""" isort:skip_file """ # flake8: noqa from ._version import get_versions @@ -17,7 +18,7 @@ from .core.options import set_options from .backends.api import (open_dataset, open_dataarray, open_mfdataset, - save_mfdataset) + save_mfdataset, load_dataset, load_dataarray) from .backends.rasterio_ import open_rasterio from .backends.zarr import open_zarr diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py index 9b9e04d9346..292a6d68523 100644 --- a/xarray/backends/__init__.py +++ b/xarray/backends/__init__.py @@ -3,16 +3,16 @@ DataStores provide a uniform interface for saving and loading data in different formats. They should not be used directly, but rather through Dataset objects. """ -from .common import AbstractDataStore -from .file_manager import FileManager, CachingFileManager, DummyFileManager from .cfgrib_ import CfGribDataStore +from .common import AbstractDataStore +from .file_manager import CachingFileManager, DummyFileManager, FileManager +from .h5netcdf_ import H5NetCDFStore from .memory import InMemoryDataStore from .netCDF4_ import NetCDF4DataStore +from .pseudonetcdf_ import PseudoNetCDFDataStore from .pydap_ import PydapDataStore from .pynio_ import NioDataStore from .scipy_ import ScipyDataStore -from .h5netcdf_ import H5NetCDFStore -from .pseudonetcdf_ import PseudoNetCDFDataStore from .zarr import ZarrStore __all__ = [ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7c5040580fe..01188e92752 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -185,12 +185,64 @@ def _finalize_store(write, store): store.close() +def load_dataset(filename_or_obj, **kwargs): + """Open, load into memory, and close a Dataset from a file or file-like + object. + + This is a thin wrapper around :py:meth:`~xarray.open_dataset`. It differs + from `open_dataset` in that it loads the Dataset into memory, closes the + file, and returns the Dataset. In contrast, `open_dataset` keeps the file + handle open and lazy loads its contents. All parameters are passed directly + to `open_dataset`. See that documentation for further details. + + Returns + ------- + dataset : Dataset + The newly created Dataset. + + See Also + -------- + open_dataset + """ + if 'cache' in kwargs: + raise TypeError('cache has no effect in this context') + + with open_dataset(filename_or_obj, **kwargs) as ds: + return ds.load() + + +def load_dataarray(filename_or_obj, **kwargs): + """Open, load into memory, and close a DataArray from a file or file-like + object containing a single data variable. + + This is a thin wrapper around :py:meth:`~xarray.open_dataarray`. It differs + from `open_dataarray` in that it loads the DataArray into memory, closes the + file, and returns the DataArray. 
In contrast, `open_dataarray` keeps the file + handle open and lazy loads its contents. All parameters are passed directly + to `open_dataarray`. See that documentation for further details. + + Returns + ------- + dataarray : DataArray + The newly created DataArray. + + See Also + -------- + open_dataarray + """ + if 'cache' in kwargs: + raise TypeError('cache has no effect in this context') + + with open_dataarray(filename_or_obj, **kwargs) as da: + return da.load() + + def open_dataset(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, backend_kwargs=None, use_cftime=None): - """Load and decode a dataset from a file or file-like object. + """Open and decode a dataset from a file or file-like object. Parameters ---------- @@ -406,7 +458,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, backend_kwargs=None, use_cftime=None): - """Open an DataArray from a netCDF file containing a single data variable. + """Open a DataArray from a file or file-like object containing a single + data variable. This is designed to read netCDF files with only one data variable. If multiple variables are present then a ValueError is raised. diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index 5955ef54d6e..0d11632fa67 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -1,7 +1,7 @@ import contextlib import threading -from typing import Any, Dict import warnings +from typing import Any, Dict from ..core import utils from ..core.options import OPTIONS diff --git a/xarray/backends/locks.py b/xarray/backends/locks.py index 65150562538..bb63186ce3a 100644 --- a/xarray/backends/locks.py +++ b/xarray/backends/locks.py @@ -1,7 +1,7 @@ import multiprocessing import threading -from typing import Any, MutableMapping import weakref +from typing import Any, MutableMapping try: from dask.utils import SerializableLock diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index b3bab9617ee..268afcfcea5 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -174,7 +174,7 @@ def _force_native_endianness(var): # if endian exists, remove it from the encoding. 
var.encoding.pop('endian', None) # check to see if encoding has a value for endian its 'native' - if not var.encoding.get('endian', 'native') is 'native': + if not var.encoding.get('endian', 'native') == 'native': raise NotImplementedError("Attempt to write non-native endian type, " "this is not supported by the netCDF4 " "python library.") @@ -206,10 +206,17 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False, chunks_too_big = any( c > d and dim not in unlimited_dims for c, d, dim in zip(chunksizes, variable.shape, variable.dims)) - changed_shape = encoding.get('original_shape') != variable.shape + has_original_shape = 'original_shape' in encoding + changed_shape = (has_original_shape and + encoding.get('original_shape') != variable.shape) if chunks_too_big or changed_shape: del encoding['chunksizes'] + var_has_unlim_dim = any(dim in unlimited_dims for dim in variable.dims) + if (not raise_on_invalid and var_has_unlim_dim + and 'contiguous' in encoding.keys()): + del encoding['contiguous'] + for k in safe_to_drop: if k in encoding: del encoding[k] @@ -230,6 +237,7 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False, class GroupWrapper: """Wrap netCDF4.Group objects so closing them closes the root group.""" + def __init__(self, value): self.value = value @@ -445,6 +453,7 @@ def prepare_variable(self, name, variable, check_encoding=False, encoding = _extract_nc4_variable_encoding( variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims) + if name in self.ds.variables: nc4_var = self.ds.variables[name] else: diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 1456f8ce3b3..cf10d6238aa 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -184,7 +184,7 @@ def get_date_type(self): def assert_all_valid_date_type(data): import cftime - if data.size: + if len(data) > 0: sample = data[0] date_type = type(sample) if not isinstance(sample, cftime.datetime): @@ -229,12 +229,12 @@ class CFTimeIndex(pd.Index): date_type = property(get_date_type) def __new__(cls, data, name=None): + assert_all_valid_date_type(data) if name is None and hasattr(data, 'name'): name = data.name result = object.__new__(cls) result._data = np.array(data, dtype='O') - assert_all_valid_date_type(result._data) result.name = name return result @@ -476,6 +476,35 @@ def to_datetimeindex(self, unsafe=False): 'dates.'.format(calendar), RuntimeWarning, stacklevel=2) return pd.DatetimeIndex(nptimes) + def strftime(self, date_format): + """ + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format doc + `__ + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d") + + Returns + ------- + Index + Index of formatted strings + + Examples + -------- + >>> rng = xr.cftime_range(start='2000', periods=5, freq='2MS', + ... 
calendar='noleap') + >>> rng.strftime('%B %d, %Y, %r') + Index(['January 01, 2000, 12:00:00 AM', 'March 01, 2000, 12:00:00 AM', + 'May 01, 2000, 12:00:00 AM', 'July 01, 2000, 12:00:00 AM', + 'September 01, 2000, 12:00:00 AM'], + dtype='object') + """ + return pd.Index([date.strftime(date_format) for date in self._data]) + def _parse_iso8601_without_reso(date_type, datetime_str): date, _ = _parse_iso8601_with_reso(date_type, datetime_str) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index ae8b97c7352..c23e45e44de 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -1,13 +1,14 @@ """Coders for individual Variable objects.""" -from typing import Any import warnings from functools import partial +from typing import Any import numpy as np import pandas as pd from ..core import dtypes, duck_array_ops, indexing from ..core.pycompat import dask_array_type +from ..core.utils import equivalent from ..core.variable import Variable @@ -145,11 +146,24 @@ class CFMaskCoder(VariableCoder): def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) - if encoding.get('_FillValue') is not None: + fv = encoding.get('_FillValue') + mv = encoding.get('missing_value') + + if fv is not None and mv is not None and not equivalent(fv, mv): + raise ValueError("Variable {!r} has multiple fill values {}. " + "Cannot encode data. " + .format(name, [fv, mv])) + + if fv is not None: fill_value = pop_to(encoding, attrs, '_FillValue', name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) + if mv is not None: + fill_value = pop_to(encoding, attrs, 'missing_value', name=name) + if not pd.isnull(fill_value) and fv is None: + data = duck_array_ops.fillna(data, fill_value) + return Variable(dims, data, attrs, encoding) def decode(self, variable, name=None): diff --git a/xarray/conventions.py b/xarray/conventions.py index 42ad800ea48..d0d90242426 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -83,7 +83,8 @@ def maybe_encode_nonstring_dtype(var, name=None): if dtype != var.dtype: if np.issubdtype(dtype, np.integer): if (np.issubdtype(var.dtype, np.floating) and - '_FillValue' not in var.attrs): + '_FillValue' not in var.attrs and + 'missing_value' not in var.attrs): warnings.warn('saving variable %s with floating ' 'point data as an integer dtype without ' 'any _FillValue to use for NaNs' % name, diff --git a/xarray/core/accessors.py b/xarray/core/accessor_dt.py similarity index 78% rename from xarray/core/accessors.py rename to xarray/core/accessor_dt.py index 640060fafe5..01cddae188f 100644 --- a/xarray/core/accessors.py +++ b/xarray/core/accessor_dt.py @@ -110,6 +110,38 @@ def _round_field(values, name, freq): return _round_series(values, name, freq) +def _strftime_through_cftimeindex(values, date_format): + """Coerce an array of cftime-like values to a CFTimeIndex + and format each element as a string with strftime + """ + from ..coding.cftimeindex import CFTimeIndex + values_as_cftimeindex = CFTimeIndex(values.ravel()) + + field_values = values_as_cftimeindex.strftime(date_format) + return field_values.values.reshape(values.shape) + + +def _strftime_through_series(values, date_format): + """Coerce an array of datetime-like values to a pandas Series and + apply string formatting + """ + values_as_series = pd.Series(values.ravel()) + strs = values_as_series.dt.strftime(date_format) + return strs.values.reshape(values.shape) + + +def _strftime(values, date_format): + if 
is_np_datetime_like(values.dtype): + access_method = _strftime_through_series + else: + access_method = _strftime_through_cftimeindex + if isinstance(values, dask_array_type): + from dask.array import map_blocks + return map_blocks(access_method, values, date_format) + else: + return access_method(values, date_format) + + class DatetimeAccessor: """Access datetime fields for DataArrays with datetime-like dtypes. @@ -133,13 +165,13 @@ class DatetimeAccessor: """ - def __init__(self, xarray_obj): - if not _contains_datetime_like_objects(xarray_obj): + def __init__(self, obj): + if not _contains_datetime_like_objects(obj): raise TypeError("'dt' accessor only available for " "DataArray with datetime64 timedelta64 dtype or " "for arrays containing cftime datetime " "objects.") - self._obj = xarray_obj + self._obj = obj def _tslib_field_accessor(name, docstring=None, dtype=None): def f(self, dtype=dtype): @@ -256,3 +288,39 @@ def round(self, freq): Array-like of datetime fields accessed for each element in values ''' return self._tslib_round_accessor("round", freq) + + def strftime(self, date_format): + ''' + Return an array of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format doc + `__ + + Parameters + ---------- + date_format : str + date format string (e.g. "%Y-%m-%d") + + Returns + ------- + formatted strings : same type as values + Array-like of strings formatted for each element in values + + Examples + -------- + >>> rng = xr.Dataset({'time': datetime.datetime(2000, 1, 1)}) + >>> rng['time'].dt.strftime('%B %d, %Y, %r') + <xarray.DataArray 'strftime' ()> + array('January 01, 2000, 12:00:00 AM', dtype=object) + ''' + obj_type = type(self._obj) + + result = _strftime(self._obj.data, date_format) + + return obj_type( + result, + name="strftime", + coords=self._obj.coords, + dims=self._obj.dims) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py new file mode 100644 index 00000000000..4a1983517eb --- /dev/null +++ b/xarray/core/accessor_str.py @@ -0,0 +1,957 @@ +# The StringAccessor class defined below is an adaptation of the +# pandas string methods source code (see pd.core.strings) + +# For reference, here is a copy of the pandas copyright notice: + +# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +# All rights reserved. + +# Copyright (c) 2008-2011 AQR Capital Management, LLC +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the copyright holder nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import codecs +import re +import textwrap + +import numpy as np + +from .computation import apply_ufunc + +_cpython_optimized_encoders = ( + "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ( + "utf-16", "utf-32" +) + + +def _is_str_like(x): + return isinstance(x, str) or isinstance(x, bytes) + + +class StringAccessor: + """Vectorized string functions for string-like arrays. + + Similar to pandas, fields can be accessed through the `.str` attribute + for applicable DataArrays. + + >>> da = xr.DataArray(['some', 'text', 'in', 'an', 'array']) + >>> da.str.len() + <xarray.DataArray (dim_0: 5)> + array([4, 4, 2, 2, 5]) + Dimensions without coordinates: dim_0 + + """ + + def __init__(self, obj): + self._obj = obj + + def _apply(self, f, dtype=None): + # TODO handling of na values ? + if dtype is None: + dtype = self._obj.dtype + + g = np.vectorize(f, otypes=[dtype]) + return apply_ufunc( + g, self._obj, dask='parallelized', output_dtypes=[dtype]) + + def len(self): + ''' + Compute the length of each element in the array. + + Returns + ------- + lengths array : array of int + ''' + return self._apply(len, dtype=int) + + def __getitem__(self, key): + if isinstance(key, slice): + return self.slice(start=key.start, stop=key.stop, step=key.step) + else: + return self.get(key) + + def get(self, i): + ''' + Extract element from indexable in each element in the array. + + Parameters + ---------- + i : int + Position of element to extract. + + Returns + ------- + items : array of objects + ''' + obj = slice(-1, None) if i == -1 else slice(i, i + 1) + return self._apply(lambda x: x[obj]) + + def slice(self, start=None, stop=None, step=None): + ''' + Slice substrings from each element in the array. + + Parameters + ---------- + start : int, optional + Start position for slice operation. + stop : int, optional + Stop position for slice operation. + step : int, optional + Step size for slice operation. + + Returns + ------- + sliced strings : same type as values + ''' + s = slice(start, stop, step) + f = lambda x: x[s] + return self._apply(f) + + def slice_replace(self, start=None, stop=None, repl=''): + ''' + Replace a positional slice of a string with another value. + + Parameters + ---------- + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. + stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified, the sliced region + is replaced with an empty string. 
+ + Returns + ------- + replaced : same type as values + ''' + repl = self._obj.dtype.type(repl) + + def f(x): + if len(x[start:stop]) == 0: + local_stop = start + else: + local_stop = stop + y = self._obj.dtype.type('') + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return self._apply(f) + + def capitalize(self): + ''' + Convert strings in the array to be capitalized. + + Returns + ------- + capitalized : same type as values + ''' + return self._apply(lambda x: x.capitalize()) + + def lower(self): + ''' + Convert strings in the array to lowercase. + + Returns + ------- + lowered : same type as values + ''' + return self._apply(lambda x: x.lower()) + + def swapcase(self): + ''' + Convert strings in the array to be swapcased. + + Returns + ------- + swapcased : same type as values + ''' + return self._apply(lambda x: x.swapcase()) + + def title(self): + ''' + Convert strings in the array to titlecase. + + Returns + ------- + titled : same type as values + ''' + return self._apply(lambda x: x.title()) + + def upper(self): + ''' + Convert strings in the array to uppercase. + + Returns + ------- + uppered : same type as values + ''' + return self._apply(lambda x: x.upper()) + + def isalnum(self): + ''' + Check whether all characters in each string are alphanumeric. + + Returns + ------- + isalnum : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isalnum(), dtype=bool) + + def isalpha(self): + ''' + Check whether all characters in each string are alphabetic. + + Returns + ------- + isalpha : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isalpha(), dtype=bool) + + def isdecimal(self): + ''' + Check whether all characters in each string are decimal. + + Returns + ------- + isdecimal : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isdecimal(), dtype=bool) + + def isdigit(self): + ''' + Check whether all characters in each string are digits. + + Returns + ------- + isdigit : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isdigit(), dtype=bool) + + def islower(self): + ''' + Check whether all characters in each string are lowercase. + + Returns + ------- + islower : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.islower(), dtype=bool) + + def isnumeric(self): + ''' + Check whether all characters in each string are numeric. + + Returns + ------- + isnumeric : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isnumeric(), dtype=bool) + + def isspace(self): + ''' + Check whether all characters in each string are spaces. + + Returns + ------- + isspace : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isspace(), dtype=bool) + + def istitle(self): + ''' + Check whether all characters in each string are titlecase. + + Returns + ------- + istitle : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.istitle(), dtype=bool) + + def isupper(self): + ''' + Check whether all characters in each string are uppercase. 
+ + Returns + ------- + isupper : array of bool + Array of boolean values with the same shape as the original array. + ''' + return self._apply(lambda x: x.isupper(), dtype=bool) + + def count(self, pat, flags=0): + ''' + Count occurrences of pattern in each string of the array. + + This function is used to count the number of times a particular regex + pattern is repeated in each of the string elements of the + :class:`~xarray.DataArray`. + + Parameters + ---------- + pat : str + Valid regular expression. + flags : int, default 0, meaning no flags + Flags for the `re` module. For a complete list, `see here + `_. + + Returns + ------- + counts : array of int + ''' + pat = self._obj.dtype.type(pat) + regex = re.compile(pat, flags=flags) + f = lambda x: len(regex.findall(x)) + return self._apply(f, dtype=int) + + def startswith(self, pat): + ''' + Test if the start of each string element matches a pattern. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + + Returns + ------- + startswith : array of bool + An array of booleans indicating whether the given pattern matches + the start of each string element. + ''' + pat = self._obj.dtype.type(pat) + f = lambda x: x.startswith(pat) + return self._apply(f, dtype=bool) + + def endswith(self, pat): + ''' + Test if the end of each string element matches a pattern. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + + Returns + ------- + endswith : array of bool + An array of booleans indicating whether the given pattern matches + the end of each string element. + ''' + pat = self._obj.dtype.type(pat) + f = lambda x: x.endswith(pat) + return self._apply(f, dtype=bool) + + def pad(self, width, side='left', fillchar=' '): + ''' + Pad strings in the array up to width. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be + filled with character defined in `fillchar`. + side : {'left', 'right', 'both'}, default 'left' + Side from which to fill resulting string. + fillchar : str, default ' ' + Additional character for filling, default is whitespace. + + Returns + ------- + filled : same type as values + Array with a minimum number of char in each element. + ''' + width = int(width) + fillchar = self._obj.dtype.type(fillchar) + if len(fillchar) != 1: + raise TypeError('fillchar must be a character, not str') + + if side == 'left': + f = lambda s: s.rjust(width, fillchar) + elif side == 'right': + f = lambda s: s.ljust(width, fillchar) + elif side == 'both': + f = lambda s: s.center(width, fillchar) + else: # pragma: no cover + raise ValueError('Invalid side') + + return self._apply(f) + + def center(self, width, fillchar=' '): + ''' + Filling left and right side of strings in the array with an + additional character. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be + filled with ``fillchar`` + fillchar : str + Additional character for filling, default is whitespace + + Returns + ------- + filled : same type as values + ''' + return self.pad(width, side='both', fillchar=fillchar) + + def ljust(self, width, fillchar=' '): + ''' + Filling right side of strings in the array with an additional + character. 
+ + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be + filled with ``fillchar`` + fillchar : str + Additional character for filling, default is whitespace + + Returns + ------- + filled : same type as values + ''' + return self.pad(width, side='right', fillchar=fillchar) + + def rjust(self, width, fillchar=' '): + ''' + Filling left side of strings in the array with an additional character. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be + filled with ``fillchar`` + fillchar : str + Additional character for filling, default is whitespace + + Returns + ------- + filled : same type as values + ''' + return self.pad(width, side='left', fillchar=fillchar) + + def zfill(self, width): + ''' + Pad strings in the array by prepending '0' characters. + + Strings in the array are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the array with length greater or equal to `width` are unchanged. + + Parameters + ---------- + width : int + Minimum length of resulting string; strings with length less + than `width` will be prepended with '0' characters. + + Returns + ------- + filled : same type as values + ''' + return self.pad(width, side='left', fillchar='0') + + def contains(self, pat, case=True, flags=0, regex=True): + ''' + Test if pattern or regex is contained within a string of the array. + + Return boolean array based on whether a given pattern or regex is + contained within a string of the array. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + regex : bool, default True + If True, assumes the pat is a regular expression. + If False, treats the pat as a literal string. + + Returns + ------- + contains : array of bool + An array of boolean values indicating whether the + given pattern is contained within the string of each element + of the array. + ''' + pat = self._obj.dtype.type(pat) + if regex: + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + if regex.groups > 0: # pragma: no cover + raise ValueError("This pattern has match groups.") + + f = lambda x: bool(regex.search(x)) + else: + if case: + f = lambda x: pat in x + else: + uppered = self._obj.str.upper() + return uppered.str.contains(pat.upper(), regex=False) + + return self._apply(f, dtype=bool) + + def match(self, pat, case=True, flags=0): + ''' + Determine if each string matches a regular expression. + + Parameters + ---------- + pat : string + Character sequence or regular expression + case : boolean, default True + If True, case sensitive + flags : int, default 0 (no flags) + re module flags, e.g. re.IGNORECASE + + Returns + ------- + matched : array of bool + ''' + if not case: + flags |= re.IGNORECASE + + pat = self._obj.dtype.type(pat) + regex = re.compile(pat, flags=flags) + f = lambda x: bool(regex.match(x)) + return self._apply(f, dtype=bool) + + def strip(self, to_strip=None, side='both'): + ''' + Remove leading and trailing characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the array from left and/or right sides. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. 
+ All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + side : {'left', 'right', 'both'}, default 'both' + Side from which to strip. + + Returns + ------- + stripped : same type as values + ''' + if to_strip is not None: + to_strip = self._obj.dtype.type(to_strip) + + if side == 'both': + f = lambda x: x.strip(to_strip) + elif side == 'left': + f = lambda x: x.lstrip(to_strip) + elif side == 'right': + f = lambda x: x.rstrip(to_strip) + else: # pragma: no cover + raise ValueError('Invalid side') + + return self._apply(f) + + def lstrip(self, to_strip=None): + ''' + Remove leading characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the array from the left side. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. + All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + stripped : same type as values + ''' + return self.strip(to_strip, side='left') + + def rstrip(self, to_strip=None): + ''' + Remove trailing characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the array from the right side. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. + All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + stripped : same type as values + ''' + return self.strip(to_strip, side='right') + + def wrap(self, width, **kwargs): + ''' + Wrap long strings in the array to be formatted in paragraphs with + length less than a given width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. + + Parameters + ---------- + width : int + Maximum line-width + expand_tabs : bool, optional + If true, tab characters will be expanded to spaces (default: True) + replace_whitespace : bool, optional + If true, each whitespace character (as defined by + string.whitespace) remaining after tab expansion will be replaced + by a single space (default: True) + drop_whitespace : bool, optional + If true, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True) + break_long_words : bool, optional + If true, then words longer than width will be broken in order to + ensure that no lines are longer than width. If it is false, long + words will not be broken, and some lines may be longer than width. + (default: True) + break_on_hyphens : bool, optional + If true, wrapping will occur preferably on whitespace and right + after hyphens in compound words, as it is customary in English. If + false, only whitespaces will be considered as potentially good + places for line breaks, but you need to set break_long_words to + false if you want truly insecable words. (default: True) + + Returns + ------- + wrapped : same type as values + ''' + tw = textwrap.TextWrapper(width=width) + f = lambda x: '\n'.join(tw.wrap(x)) + return self._apply(f) + + def translate(self, table): + ''' + Map all characters in the string through the given mapping table. + + Parameters + ---------- + table : dict + A mapping of Unicode ordinals to Unicode ordinals, strings, + or None. Unmapped characters are left untouched. Characters mapped + to None are deleted. 
:meth:`str.maketrans` is a helper function for + making translation tables. + + Returns + ------- + translated : same type as values + ''' + f = lambda x: x.translate(table) + return self._apply(f) + + def repeat(self, repeats): + ''' + Duplicate each string in the array. + + Parameters + ---------- + repeats : int + Number of repetitions. + + Returns + ------- + repeated : same type as values + Array of repeated string objects. + ''' + f = lambda x: repeats * x + return self._apply(f) + + def find(self, sub, start=0, end=None, side='left'): + ''' + Return lowest or highest indexes in each string in the array + where the substring is fully contained between [start:end]. + Return -1 on failure. + + Parameters + ---------- + sub : str + Substring being searched + start : int + Left edge index + end : int + Right edge index + side : {'left', 'right'}, default 'left' + Starting side for search. + + Returns + ------- + found : array of integer values + ''' + sub = self._obj.dtype.type(sub) + + if side == 'left': + method = 'find' + elif side == 'right': + method = 'rfind' + else: # pragma: no cover + raise ValueError('Invalid side') + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return self._apply(f, dtype=int) + + def rfind(self, sub, start=0, end=None): + ''' + Return highest indexes in each string in the array + where the substring is fully contained between [start:end]. + Return -1 on failure. + + Parameters + ---------- + sub : str + Substring being searched + start : int + Left edge index + end : int + Right edge index + + Returns + ------- + found : array of integer values + ''' + return self.find(sub, start=start, end=end, side='right') + + def index(self, sub, start=0, end=None, side='left'): + ''' + Return lowest or highest indexes in each string where the substring is + fully contained between [start:end]. This is the same as + ``str.find`` except instead of returning -1, it raises a ValueError + when the substring is not found. + + Parameters + ---------- + sub : str + Substring being searched + start : int + Left edge index + end : int + Right edge index + side : {'left', 'right'}, default 'left' + Starting side for search. + + Returns + ------- + found : array of integer values + ''' + sub = self._obj.dtype.type(sub) + + if side == 'left': + method = 'index' + elif side == 'right': + method = 'rindex' + else: # pragma: no cover + raise ValueError('Invalid side') + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return self._apply(f, dtype=int) + + def rindex(self, sub, start=0, end=None): + ''' + Return highest indexes in each string where the substring is + fully contained between [start:end]. This is the same as + ``str.rfind`` except instead of returning -1, it raises a ValueError + when the substring is not found. + + Parameters + ---------- + sub : str + Substring being searched + start : int + Left edge index + end : int + Right edge index + + Returns + ------- + found : array of integer values + ''' + return self.index(sub, start=start, end=end, side='right') + + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + ''' + Replace occurrences of pattern/regex in the array with some string. + + Parameters + ---------- + pat : string or compiled regex + String can be a character sequence or regular expression. + + repl : string or callable + Replacement string or a callable. 
The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + + n : int, default -1 (all) + Number of replacements to make from start + case : boolean, default None + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex + flags : int, default 0 (no flags) + - re module flags, e.g. re.IGNORECASE + - Cannot be set if `pat` is a compiled regex + regex : boolean, default True + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + Returns + ------- + replaced : same type as values + A copy of the object with all matching occurrences of `pat` + replaced by `repl`. + ''' + if not (_is_str_like(repl) or callable(repl)): # pragma: no cover + raise TypeError("repl must be a string or callable") + + if _is_str_like(pat): + pat = self._obj.dtype.type(pat) + + if _is_str_like(repl): + repl = self._obj.dtype.type(repl) + + is_compiled_re = isinstance(pat, type(re.compile(''))) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError("case and flags cannot be set" + " when pat is a compiled regex") + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + else: + if is_compiled_re: + raise ValueError("Cannot use a compiled regex as replacement " + "pattern with regex=False") + if callable(repl): + raise ValueError("Cannot use a callable replacement when " + "regex=False") + f = lambda x: x.replace(pat, repl, n) + return self._apply(f) + + def decode(self, encoding, errors='strict'): + ''' + Decode character string in the array using indicated encoding. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + decoded : same type as values + ''' + if encoding in _cpython_optimized_decoders: + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + return self._apply(f, dtype=np.str_) + + def encode(self, encoding, errors='strict'): + ''' + Encode character string in the array using indicated encoding. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + encoded : same type as values + ''' + if encoding in _cpython_optimized_encoders: + f = lambda x: x.encode(encoding, errors) + else: + encoder = codecs.getencoder(encoding) + f = lambda x: encoder(x, errors)[0] + return self._apply(f, dtype=np.bytes_) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 295f69a2afc..031861b0ccf 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from . import utils, dtypes +from . 
import dtypes, utils from .indexing import get_indexer_nd from .utils import is_dict_like, is_full_slice from .variable import IndexVariable, Variable diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 1abd14cd20b..0b18aa47dee 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -4,17 +4,17 @@ import pandas as pd -from . import utils +from . import dtypes, utils from .alignment import align +from .computation import result_name from .merge import merge from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars -from .computation import result_name def concat(objs, dim=None, data_vars='all', coords='different', compat='equals', positions=None, indexers=None, mode=None, - concat_over=None): + concat_over=None, fill_value=dtypes.NA): """Concatenate xarray objects along a new or existing dimension. Parameters @@ -66,6 +66,8 @@ def concat(objs, dim=None, data_vars='all', coords='different', List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. If not supplied, objects are concatenated in the provided order. + fill_value : scalar, optional + Value to use for newly missing values indexers, mode, concat_over : deprecated Returns @@ -117,7 +119,7 @@ def concat(objs, dim=None, data_vars='all', coords='different', else: raise TypeError('can only concatenate xarray Dataset and DataArray ' 'objects, got %s' % type(first_obj)) - return f(objs, dim, data_vars, coords, compat, positions) + return f(objs, dim, data_vars, coords, compat, positions, fill_value) def _calc_concat_dim_coord(dim): @@ -212,7 +214,8 @@ def process_subset_opt(opt, subset): return concat_over, equals -def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): +def _dataset_concat(datasets, dim, data_vars, coords, compat, positions, + fill_value=dtypes.NA): """ Concatenate a sequence of datasets along a new or existing dimension """ @@ -225,7 +228,8 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): dim, coord = _calc_concat_dim_coord(dim) # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] - datasets = align(*datasets, join='outer', copy=False, exclude=[dim]) + datasets = align(*datasets, join='outer', copy=False, exclude=[dim], + fill_value=fill_value) concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) @@ -317,7 +321,7 @@ def ensure_common_dims(vars): def _dataarray_concat(arrays, dim, data_vars, coords, compat, - positions): + positions, fill_value=dtypes.NA): arrays = list(arrays) if data_vars != 'all': @@ -336,14 +340,15 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat, datasets.append(arr._to_temp_dataset()) ds = _dataset_concat(datasets, dim, data_vars, coords, compat, - positions) + positions, fill_value) result = arrays[0]._from_temp_dataset(ds, name) result.name = result_name(arrays) return result -def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): +def _auto_concat(datasets, dim=None, data_vars='all', coords='different', + fill_value=dtypes.NA): if len(datasets) == 1 and dim is None: # There is nothing more to combine, so kick out early. 
return datasets[0] @@ -366,7 +371,8 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): 'supply the ``concat_dim`` argument ' 'explicitly') dim, = concat_dims - return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) + return concat(datasets, dim=dim, data_vars=data_vars, + coords=coords, fill_value=fill_value) _CONCAT_DIM_DEFAULT = utils.ReprObject('') @@ -442,7 +448,8 @@ def _check_shape_tile_ids(combined_tile_ids): def _combine_nd(combined_ids, concat_dims, data_vars='all', - coords='different', compat='no_conflicts'): + coords='different', compat='no_conflicts', + fill_value=dtypes.NA): """ Concatenates and merges an N-dimensional structure of datasets. @@ -472,13 +479,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', dim=concat_dim, data_vars=data_vars, coords=coords, - compat=compat) + compat=compat, + fill_value=fill_value) combined_ds = list(combined_ids.values())[0] return combined_ds def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, - coords, compat): + coords, compat, fill_value=dtypes.NA): # Group into lines of datasets which must be combined along dim # need to sort by _new_tile_id first for groupby to work # TODO remove all these sorted OrderedDicts once python >= 3.6 only @@ -490,7 +498,8 @@ def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, combined_ids = OrderedDict(sorted(group)) datasets = combined_ids.values() new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, - data_vars, coords) + data_vars, coords, + fill_value) return new_combined_ids @@ -500,18 +509,20 @@ def vars_as_keys(ds): def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', - data_vars='all', coords='different'): + data_vars='all', coords='different', + fill_value=dtypes.NA): # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim sorted_datasets = sorted(datasets, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) concatenated = [_auto_concat(list(ds_group), dim=dim, - data_vars=data_vars, coords=coords) + data_vars=data_vars, coords=coords, + fill_value=fill_value) for id, ds_group in grouped_by_vars] else: concatenated = datasets - merged = merge(concatenated, compat=compat) + merged = merge(concatenated, compat=compat, fill_value=fill_value) return merged @@ -521,7 +532,7 @@ def _new_tile_id(single_id_ds_pair): def _auto_combine(datasets, concat_dims, compat, data_vars, coords, - infer_order_from_coords, ids): + infer_order_from_coords, ids, fill_value=dtypes.NA): """ Calls logic to decide concatenation order before concatenating. """ @@ -550,12 +561,14 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) + data_vars=data_vars, coords=coords, + fill_value=fill_value) return combined def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', data_vars='all', coords='different'): + compat='no_conflicts', data_vars='all', coords='different', + fill_value=dtypes.NA): """Attempt to auto-magically combine the given datasets into one. This method attempts to combine a list of datasets into a single entity by inspecting metadata and using a combination of concat and merge. 
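The `fill_value` keyword threaded through `concat`, `_auto_concat`, and `auto_combine` above controls what fills the holes that the outer join creates when the inputs do not share identical indexes. A minimal sketch of the behaviour this enables; the datasets here are illustrative and not taken from this patch:

```python
import xarray as xr

a = xr.Dataset({'u': (('t', 'x'), [[1.0, 2.0]])},
               coords={'t': [0], 'x': [10, 20]})
b = xr.Dataset({'u': (('t', 'x'), [[3.0, 4.0]])},
               coords={'t': [1], 'x': [10, 30]})

# The inputs disagree on 'x', so the outer join inside concat() must
# create missing values; fill_value replaces the default NaN there.
combined = xr.concat([a, b], dim='t', fill_value=0.0)
```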
@@ -596,6 +609,8 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
         Details are in the documentation of concat
     coords : {'minimal', 'different', 'all' or list of str}, optional
         Details are in the documentation of concat
+    fill_value : scalar, optional
+        Value to use for newly missing values
 
     Returns
     -------
@@ -622,4 +637,4 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
     return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
                          data_vars=data_vars, coords=coords,
                          infer_order_from_coords=infer_order_from_coords,
-                         ids=False)
+                         ids=False, fill_value=fill_value)
diff --git a/xarray/core/common.py b/xarray/core/common.py
index b518e8431fd..0195be62500 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1,8 +1,9 @@
 from collections import OrderedDict
 from contextlib import suppress
 from textwrap import dedent
-from typing import (Any, Callable, Hashable, Iterable, Iterator, List, Mapping,
-                    MutableMapping, Optional, Tuple, TypeVar, Union)
+from typing import (
+    Any, Callable, Hashable, Iterable, Iterator, List, Mapping, MutableMapping,
+    Optional, Tuple, TypeVar, Union)
 
 import numpy as np
 import pandas as pd
@@ -11,9 +12,9 @@
 from .arithmetic import SupportsArithmetic
 from .options import _get_keep_attrs
 from .pycompat import dask_array_type
+from .rolling_exp import RollingExp
 from .utils import Frozen, ReprObject, SortedKeysDict, either_dict_or_kwargs
 
-
 # Used as a sentinel value to indicate all dimensions
 ALL_DIMS = ReprObject('<all-dims>')
 
@@ -86,6 +87,7 @@ def wrapped_func(self, dim=None, **kwargs):  # type: ignore
 
 class AbstractArray(ImplementsArrayReduce):
     """Shared base class for DataArray and Variable.
     """
+
     def __bool__(self: Any) -> bool:
         return bool(self.values)
 
@@ -249,6 +251,8 @@ def get_squeeze_dims(xarray_obj,
 
 class DataWithCoords(SupportsArithmetic, AttrAccessMixin):
     """Shared base class for Dataset and DataArray."""
+
+    _rolling_exp_cls = RollingExp
 
     def squeeze(self, dim: Union[Hashable, Iterable[Hashable], None] = None,
                 drop: bool = False,
                 axis: Union[int, Iterable[int], None] = None):
@@ -441,7 +445,8 @@ def pipe(self, func: Union[Callable[..., T], Tuple[Callable[..., T], str]],
         else:
             return func(self, *args, **kwargs)
 
-    def groupby(self, group, squeeze: bool = True):
+    def groupby(self, group, squeeze: bool = True,
+                restore_coord_dims: Optional[bool] = None):
         """Returns a GroupBy object for performing grouped operations.
 
         Parameters
@@ -453,6 +458,9 @@ def groupby(self, group, squeeze: bool = True):
             If "group" is a dimension of any arrays in this dataset, `squeeze`
             controls whether the subarrays have a dimension of length 1 along
             that dimension or if the dimension is squeezed out.
+        restore_coord_dims : bool, optional
+            If True, also restore the dimension order of multi-dimensional
+            coordinates.
 
         Returns
         -------
@@ -485,11 +493,13 @@ def groupby(self, group, squeeze: bool = True):
         core.groupby.DataArrayGroupBy
         core.groupby.DatasetGroupBy
         """  # noqa
-        return self._groupby_cls(self, group, squeeze=squeeze)
+        return self._groupby_cls(self, group, squeeze=squeeze,
+                                 restore_coord_dims=restore_coord_dims)
 
     def groupby_bins(self, group, bins, right: bool = True, labels=None,
                      precision: int = 3, include_lowest: bool = False,
-                     squeeze: bool = True):
+                     squeeze: bool = True,
+                     restore_coord_dims: Optional[bool] = None):
         """Returns a GroupBy object for performing grouped operations.
 
         Rather than using all unique values of `group`, the values are discretized
@@ -522,6 +532,9 @@ def groupby_bins(self, group, bins, right: bool = True, labels=None,
             If "group" is a dimension of any arrays in this dataset, `squeeze`
             controls whether the subarrays have a dimension of length 1 along
             that dimension or if the dimension is squeezed out.
+        restore_coord_dims : bool, optional
+            If True, also restore the dimension order of multi-dimensional
+            coordinates.
 
         Returns
         -------
@@ -536,13 +549,15 @@ def groupby_bins(self, group, bins, right: bool = True, labels=None,
         .. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
         """  # noqa
         return self._groupby_cls(self, group, squeeze=squeeze, bins=bins,
+                                 restore_coord_dims=restore_coord_dims,
                                  cut_kwargs={'right': right, 'labels': labels,
                                              'precision': precision,
-                                             'include_lowest': include_lowest})
+                                             'include_lowest':
+                                             include_lowest})
 
     def rolling(self, dim: Optional[Mapping[Hashable, int]] = None,
                 min_periods: Optional[int] = None, center: bool = False,
-                **dim_kwargs: int):
+                **window_kwargs: int):
         """
         Rolling window object.
 
@@ -557,9 +572,9 @@ def rolling(self, dim: Optional[Mapping[Hashable, int]] = None,
             setting min_periods equal to the size of the window.
         center : boolean, default False
             Set the labels at the center of the window.
-        **dim_kwargs : optional
+        **window_kwargs : optional
             The keyword arguments form of ``dim``.
-            One of dim or dim_kwargs must be provided.
+            One of dim or window_kwargs must be provided.
 
         Returns
         -------
@@ -598,15 +613,54 @@ def rolling(self, dim: Optional[Mapping[Hashable, int]] = None,
         core.rolling.DataArrayRolling
         core.rolling.DatasetRolling
         """  # noqa
-        dim = either_dict_or_kwargs(dim, dim_kwargs, 'rolling')
+        dim = either_dict_or_kwargs(dim, window_kwargs, 'rolling')
         return self._rolling_cls(self, dim, min_periods=min_periods,
                                  center=center)
 
+    def rolling_exp(
+        self,
+        window: Optional[Mapping[Hashable, int]] = None,
+        window_type: str = 'span',
+        **window_kwargs
+    ):
+        """
+        Exponentially-weighted moving window.
+        Similar to EWM in pandas.
+
+        Requires the optional Numbagg dependency.
+
+        Parameters
+        ----------
+        window : mapping from a single dimension name to a window value,
+            optional
+            The dimension name (e.g. `time`) is the dimension to create the
+            rolling exponential window along, and the value is the size of
+            the moving window, interpreted according to `window_type`.
+        window_type : str, one of ['span', 'com', 'halflife', 'alpha'],
+            default 'span'
+            The format of the previously supplied window. Each is a simple
+            numerical transformation of the others. Described in detail:
+            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.ewm.html
+        **window_kwargs : optional
+            The keyword arguments form of ``window``.
+            One of window or window_kwargs must be provided.
+
+        See Also
+        --------
+        core.rolling_exp.RollingExp
+        """
+        window = either_dict_or_kwargs(window, window_kwargs, 'rolling_exp')
+
+        return self._rolling_exp_cls(self, window, window_type)
+
     def coarsen(self, dim: Optional[Mapping[Hashable, int]] = None,
                 boundary: str = 'exact',
                 side: Union[str, Mapping[Hashable, str]] = 'left',
                 coord_func: str = 'mean',
-                **dim_kwargs: int):
+                **window_kwargs: int):
         """
         Coarsen object.
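For context, `rolling_exp` above is a thin wrapper that dispatches to the `RollingExp` class added later in this patch (via the `_rolling_exp_cls` attribute). A usage sketch, assuming the optional numbagg dependency is installed:

```python
import xarray as xr

da = xr.DataArray([1.0, 1.0, 2.0, 2.0, 2.0], dims='x')

# Exponentially-weighted moving average along 'x', with the window
# expressed as a pandas-style span; requires the numbagg package.
smoothed = da.rolling_exp(x=2, window_type='span').mean()
```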
@@ -660,7 +714,7 @@ def coarsen(self, dim: Optional[Mapping[Hashable, int]] = None, core.rolling.DataArrayCoarsen core.rolling.DatasetCoarsen """ - dim = either_dict_or_kwargs(dim, dim_kwargs, 'coarsen') + dim = either_dict_or_kwargs(dim, window_kwargs, 'coarsen') return self._coarsen_cls( self, dim, boundary=boundary, side=side, coord_func=coord_func) @@ -669,7 +723,7 @@ def resample(self, indexer: Optional[Mapping[Hashable, str]] = None, skipna=None, closed: Optional[str] = None, label: Optional[str] = None, base: int = 0, keep_attrs: Optional[bool] = None, - loffset=None, + loffset=None, restore_coord_dims: Optional[bool] = None, **indexer_kwargs: str): """Returns a Resample object for performing resampling operations. @@ -697,6 +751,9 @@ def resample(self, indexer: Optional[Mapping[Hashable, str]] = None, If True, the object's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + restore_coord_dims : bool, optional + If True, also restore the dimension order of multi-dimensional + coordinates. **indexer_kwargs : {dim: freq} The keyword arguments form of ``indexer``. One of indexer or indexer_kwargs must be provided. @@ -786,7 +843,8 @@ def resample(self, indexer: Optional[Mapping[Hashable, str]] = None, dims=dim_coord.dims, name=RESAMPLE_DIM) resampler = self._resample_cls(self, group=group, dim=dim_name, grouper=grouper, - resample_dim=RESAMPLE_DIM) + resample_dim=RESAMPLE_DIM, + restore_coord_dims=restore_coord_dims) return resampler diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index ea3eaa0f4f2..6a5795ccdc6 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -193,7 +193,7 @@ def _update_coords(self, coords): self._data._variables = variables self._data._coord_names.update(new_coord_names) - self._data._dims = dict(dims) + self._data._dims = dims self._data._indexes = None def __delitem__(self, key): diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8d3836f5d8c..ff77a6ab704 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2,6 +2,7 @@ import sys import warnings from collections import OrderedDict +from typing import Any import numpy as np import pandas as pd @@ -9,7 +10,8 @@ from ..plot.plot import _PlotMethods from . 
import ( computation, dtypes, groupby, indexing, ops, resample, rolling, utils) -from .accessors import DatetimeAccessor +from .accessor_dt import DatetimeAccessor +from .accessor_str import StringAccessor from .alignment import align, reindex_like_indexers from .common import AbstractArray, DataWithCoords from .coordinates import ( @@ -66,7 +68,7 @@ def _infer_coords_and_dims(shape, coords, dims): for dim, coord in zip(dims, coords): var = as_variable(coord, name=dim) var.dims = (dim,) - new_coords[dim] = var + new_coords[dim] = var.to_index_variable() sizes = dict(zip(dims, shape)) for k, v in new_coords.items(): @@ -162,6 +164,7 @@ class DataArray(AbstractArray, DataWithCoords): _resample_cls = resample.DataArrayResample dt = property(DatetimeAccessor) + str = property(StringAccessor) def __init__(self, data, coords=None, dims=None, name=None, attrs=None, encoding=None, indexes=None, fastpath=False): @@ -256,8 +259,14 @@ def _replace(self, variable=None, coords=None, name=__default): return type(self)(variable, coords, name=name, fastpath=True) def _replace_maybe_drop_dims(self, variable, name=__default): - if variable.dims == self.dims: + if variable.dims == self.dims and variable.shape == self.shape: coords = self._coords.copy() + elif variable.dims == self.dims: + # Shape has changed (e.g. from reduce(..., keepdims=True) + new_sizes = dict(zip(self.dims, variable.shape)) + coords = OrderedDict((k, v) for k, v in self._coords.items() + if v.shape == tuple(new_sizes[d] + for d in v.dims)) else: allowed_dims = set(variable.dims) coords = OrderedDict((k, v) for k, v in self._coords.items() @@ -1349,7 +1358,7 @@ def stack(self, dimensions=None, **dimensions_kwargs): >>> stacked = arr.stack(z=('x', 'y')) >>> stacked.indexes['z'] MultiIndex(levels=[['a', 'b'], [0, 1, 2]], - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=['x', 'y']) See also @@ -1392,7 +1401,7 @@ def unstack(self, dim=None): >>> stacked = arr.stack(z=('x', 'y')) >>> stacked.indexes['z'] MultiIndex(levels=[['a', 'b'], [0, 1, 2]], - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=['x', 'y']) >>> roundtripped = stacked.unstack() >>> arr.identical(roundtripped) @@ -1405,7 +1414,7 @@ def unstack(self, dim=None): ds = self._to_temp_dataset().unstack(dim) return self._from_temp_dataset(ds) - def transpose(self, *dims) -> 'DataArray': + def transpose(self, *dims, transpose_coords=None) -> 'DataArray': """Return a new DataArray object with transposed dimensions. Parameters @@ -1413,6 +1422,8 @@ def transpose(self, *dims) -> 'DataArray': *dims : str, optional By default, reverse the dimensions. Otherwise, reorder the dimensions to this order. + transpose_coords : boolean, optional + If True, also transpose the coordinates of this DataArray. 
Returns ------- @@ -1430,14 +1441,34 @@ def transpose(self, *dims) -> 'DataArray': numpy.transpose Dataset.transpose """ + if dims: + if set(dims) ^ set(self.dims): + raise ValueError('arguments to transpose (%s) must be ' + 'permuted array dimensions (%s)' + % (dims, tuple(self.dims))) + variable = self.variable.transpose(*dims) - return self._replace(variable) + if transpose_coords: + coords = OrderedDict() # type: OrderedDict[Any, Variable] + for name, coord in self.coords.items(): + coord_dims = tuple(dim for dim in dims if dim in coord.dims) + coords[name] = coord.variable.transpose(*coord_dims) + return self._replace(variable, coords) + else: + if transpose_coords is None \ + and any(self[c].ndim > 1 for c in self.coords): + warnings.warn('This DataArray contains multi-dimensional ' + 'coordinates. In the future, these coordinates ' + 'will be transposed as well unless you specify ' + 'transpose_coords=False.', + FutureWarning, stacklevel=2) + return self._replace(variable) @property def T(self) -> 'DataArray': return self.transpose() - def drop(self, labels, dim=None): + def drop(self, labels, dim=None, *, errors='raise'): """Drop coordinates or index labels from this DataArray. Parameters @@ -1447,14 +1478,18 @@ def drop(self, labels, dim=None): dim : str, optional Dimension along which to drop index labels. By default (if ``dim is None``), drops coordinates rather than index labels. - + errors: {'raise', 'ignore'}, optional + If 'raise' (default), raises a ValueError error if + any of the coordinates or index labels passed are not + in the array. If 'ignore', any given labels that are in the + array are dropped and no error is raised. Returns ------- dropped : DataArray """ if utils.is_scalar(labels): labels = [labels] - ds = self._to_temp_dataset().drop(labels, dim) + ds = self._to_temp_dataset().drop(labels, dim, errors=errors) return self._from_temp_dataset(ds) def dropna(self, dim, how='any', thresh=None): @@ -1613,7 +1648,8 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): + def reduce(self, func, dim=None, axis=None, keep_attrs=None, + keepdims=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1633,6 +1669,10 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. **kwargs : dict Additional keyword arguments passed on to `func`. @@ -1643,7 +1683,8 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): summarized data and the indicated dimension(s) removed. """ - var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs) + var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims, + **kwargs) return self._replace_maybe_drop_dims(var) def to_pandas(self): @@ -1724,8 +1765,9 @@ def to_masked_array(self, copy=True): result : MaskedArray Masked where invalid values (nan or inf) occur. 
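To make the new `transpose_coords` behaviour concrete, here is a small sketch (the array and coordinate are illustrative): with `transpose_coords=True`, a multi-dimensional coordinate follows the data into the new dimension order rather than keeping its original one.

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((2, 3)), dims=('x', 'y'),
                  coords={'xy': (('x', 'y'), np.arange(6).reshape(2, 3))})

flipped = da.transpose('y', 'x', transpose_coords=True)
# The 2D coordinate is transposed along with the data:
assert flipped.coords['xy'].dims == ('y', 'x')
```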
""" - isnull = pd.isnull(self.values) - return np.ma.MaskedArray(data=self.values, mask=isnull, copy=copy) + values = self.values # only compute lazy arrays once + isnull = pd.isnull(values) + return np.ma.MaskedArray(data=values, mask=isnull, copy=copy) def to_netcdf(self, *args, **kwargs): """Write DataArray contents to a netCDF file. @@ -1992,6 +2034,14 @@ def __array_wrap__(self, obj, context=None): new_var = self.variable.__array_wrap__(obj, context) return self._replace(new_var) + def __matmul__(self, obj): + return self.dot(obj) + + def __rmatmul__(self, other): + # currently somewhat duplicative, as only other DataArrays are + # compatible with matmul + return computation.dot(other, self) + @staticmethod def _unary_op(f): @functools.wraps(f) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e9ec1445dd4..3e00640ba60 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -8,7 +8,7 @@ from distutils.version import LooseVersion from numbers import Number from typing import ( - Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, Union, Sequence) + Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar, Union) import numpy as np import pandas as pd @@ -35,8 +35,7 @@ from .pycompat import TYPE_CHECKING, dask_array_type from .utils import ( Frozen, SortedKeysDict, _check_inplace, decode_numpy_dict_values, - either_dict_or_kwargs, ensure_us_time_resolution, hashable, is_dict_like, - maybe_wrap_array) + either_dict_or_kwargs, hashable, maybe_wrap_array) from .variable import IndexVariable, Variable, as_variable, broadcast_variables if TYPE_CHECKING: @@ -101,7 +100,7 @@ def calculate_dimensions(variables): Returns dictionary mapping from dimension names to sizes. Raises ValueError if any of the dimension sizes conflict. 
""" - dims = OrderedDict() + dims = {} last_used = {} scalar_vars = set(k for k, v in variables.items() if not v.dims) for k, var in variables.items(): @@ -693,7 +692,7 @@ def _construct_direct(cls, variables, coord_names, dims, attrs=None, @classmethod def _from_vars_and_coord_names(cls, variables, coord_names, attrs=None): - dims = dict(calculate_dimensions(variables)) + dims = calculate_dimensions(variables) return cls._construct_direct(variables, coord_names, dims, attrs) # TODO(shoyer): renable type checking on this signature when pytype has a @@ -754,18 +753,20 @@ def _replace_with_new_dims( # type: ignore coord_names: set = None, attrs: 'Optional[OrderedDict]' = __default, indexes: 'Optional[OrderedDict[Any, pd.Index]]' = __default, + encoding: Optional[dict] = __default, inplace: bool = False, ) -> T: """Replace variables with recalculated dimensions.""" - dims = dict(calculate_dimensions(variables)) + dims = calculate_dimensions(variables) return self._replace( - variables, coord_names, dims, attrs, indexes, inplace=inplace) + variables, coord_names, dims, attrs, indexes, encoding, + inplace=inplace) def _replace_vars_and_dims( # type: ignore self: T, variables: 'OrderedDict[Any, Variable]' = None, coord_names: set = None, - dims: 'OrderedDict[Any, int]' = None, + dims: Dict[Any, int] = None, attrs: 'Optional[OrderedDict]' = __default, inplace: bool = False, ) -> T: @@ -1081,6 +1082,7 @@ def __delitem__(self, key): """ del self._variables[key] self._coord_names.discard(key) + self._dims = calculate_dimensions(self._variables) # mutable objects should not be hashable # https://github.com/python/mypy/issues/4266 @@ -2470,7 +2472,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): else: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. - variables[k] = v.set_dims(k) + variables[k] = v.set_dims(k).to_index_variable() new_dims = self._dims.copy() new_dims.update(dim) @@ -2824,7 +2826,7 @@ def _assert_all_in_dataset(self, names, virtual_okay=False): raise ValueError('One or more of the specified variables ' 'cannot be found in this dataset') - def drop(self, labels, dim=None): + def drop(self, labels, dim=None, *, errors='raise'): """Drop variables or index labels from this dataset. Parameters @@ -2834,33 +2836,41 @@ def drop(self, labels, dim=None): dim : None or str, optional Dimension along which to drop index labels. By default (if ``dim is None``), drops variables rather than index labels. + errors: {'raise', 'ignore'}, optional + If 'raise' (default), raises a ValueError error if + any of the variable or index labels passed are not + in the dataset. If 'ignore', any given labels that are in the + dataset are dropped and no error is raised. 
Returns ------- dropped : Dataset """ + if errors not in ['raise', 'ignore']: + raise ValueError('errors must be either "raise" or "ignore"') if utils.is_scalar(labels): labels = [labels] if dim is None: - return self._drop_vars(labels) + return self._drop_vars(labels, errors=errors) else: try: index = self.indexes[dim] except KeyError: raise ValueError( 'dimension %r does not have coordinate labels' % dim) - new_index = index.drop(labels) + new_index = index.drop(labels, errors=errors) return self.loc[{dim: new_index}] - def _drop_vars(self, names): - self._assert_all_in_dataset(names) + def _drop_vars(self, names, errors='raise'): + if errors == 'raise': + self._assert_all_in_dataset(names) drop = set(names) variables = OrderedDict((k, v) for k, v in self._variables.items() if k not in drop) coord_names = set(k for k in self._coord_names if k in variables) return self._replace_vars_and_dims(variables, coord_names) - def drop_dims(self, drop_dims): + def drop_dims(self, drop_dims, *, errors='raise'): """Drop dimensions and associated variables from this dataset. Parameters @@ -2873,14 +2883,23 @@ def drop_dims(self, drop_dims): obj : Dataset The dataset without the given dimensions (or any variables containing those dimensions) + errors: {'raise', 'ignore'}, optional + If 'raise' (default), raises a ValueError error if + any of the dimensions passed are not + in the dataset. If 'ignore', any given dimensions that are in the + dataset are dropped and no error is raised. """ + if errors not in ['raise', 'ignore']: + raise ValueError('errors must be either "raise" or "ignore"') + if utils.is_scalar(drop_dims): drop_dims = [drop_dims] - missing_dimensions = [d for d in drop_dims if d not in self.dims] - if missing_dimensions: - raise ValueError('Dataset does not contain the dimensions: %s' - % missing_dimensions) + if errors == 'raise': + missing_dimensions = [d for d in drop_dims if d not in self.dims] + if missing_dimensions: + raise ValueError('Dataset does not contain the dimensions: %s' + % missing_dimensions) drop_vars = set(k for k, v in self._variables.items() for d in v.dims if d in drop_dims) @@ -3133,8 +3152,8 @@ def combine_first(self, other): out = ops.fillna(self, other, join="outer", dataset_join="outer") return out - def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, - allow_lazy=False, **kwargs): + def reduce(self, func, dim=None, keep_attrs=None, keepdims=False, + numeric_only=False, allow_lazy=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). Parameters @@ -3150,6 +3169,10 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one. Coordinates that use these dimensions + are removed. numeric_only : bool, optional If True, only apply ``func`` to variables with a numeric dtype. 
**kwargs : dict @@ -3199,6 +3222,7 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, reduce_dims = None variables[name] = var.reduce(func, dim=reduce_dims, keep_attrs=keep_attrs, + keepdims=keepdims, allow_lazy=allow_lazy, **kwargs) @@ -3557,12 +3581,15 @@ def from_dict(cls, d): def _unary_op(f, keep_attrs=False): @functools.wraps(f) def func(self, *args, **kwargs): - ds = self.coords.to_dataset() - for k in self.data_vars: - ds._variables[k] = f(self._variables[k], *args, **kwargs) - if keep_attrs: - ds._attrs = self._attrs - return ds + variables = OrderedDict() + for k, v in self._variables.items(): + if k in self._coord_names: + variables[k] = v + else: + variables[k] = f(v, *args, **kwargs) + attrs = self._attrs if keep_attrs else None + return self._replace_with_new_dims( + variables, attrs=attrs, encoding=None) return func @@ -4145,7 +4172,7 @@ def _integrate_one(self, coord, datetime_unit=None): from .variable import Variable if coord not in self.variables and coord not in self.dims: - raise ValueError('Coordinate {} does not exist.'.format(dim)) + raise ValueError('Coordinate {} does not exist.'.format(coord)) coord_var = self[coord].variable if coord_var.ndim != 1: diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index b37e01cb7af..bc66eb71ced 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -4,9 +4,9 @@ accept or return xarray objects. """ import contextlib -from functools import partial import inspect import warnings +from functools import partial import numpy as np import pandas as pd diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 82a92044caf..108e85f729f 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -197,7 +197,7 @@ class GroupBy(SupportsArithmetic): """ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, - cut_kwargs={}): + restore_coord_dims=None, cut_kwargs={}): """Create a GroupBy object Parameters @@ -215,6 +215,9 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, bins : array-like, optional If `bins` is specified, the groups will be discretized into the specified bins by `pandas.cut`. + restore_coord_dims : bool, optional + If True, also restore the dimension order of multi-dimensional + coordinates. cut_kwargs : dict, optional Extra keyword arguments to pass to `pandas.cut` @@ -279,6 +282,16 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, safe_cast_to_index(group), sort=(bins is None)) unique_coord = IndexVariable(group.name, unique_values) + if isinstance(obj, DataArray) \ + and restore_coord_dims is None \ + and any(obj[c].ndim > 1 for c in obj.coords): + warnings.warn('This DataArray contains multi-dimensional ' + 'coordinates. 
In the future, the dimension order ' + 'of these coordinates will be restored as well ' + 'unless you specify restore_coord_dims=False.', + FutureWarning, stacklevel=2) + restore_coord_dims = False + # specification for the groupby operation self._obj = obj self._group = group @@ -288,6 +301,7 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, self._stacked_dim = stacked_dim self._inserted_dims = inserted_dims self._full_index = full_index + self._restore_coord_dims = restore_coord_dims # cached attributes self._groups = None @@ -508,7 +522,8 @@ def lookup_order(dimension): return axis new_order = sorted(stacked.dims, key=lookup_order) - return stacked.transpose(*new_order) + return stacked.transpose( + *new_order, transpose_coords=self._restore_coord_dims) def apply(self, func, shortcut=False, args=(), **kwargs): """Apply a function over each array in the group and concatenate them @@ -558,7 +573,7 @@ def apply(self, func, shortcut=False, args=(), **kwargs): for arr in grouped) return self._combine(applied, shortcut=shortcut) - def _combine(self, applied, shortcut=False): + def _combine(self, applied, restore_coord_dims=False, shortcut=False): """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) coord, dim, positions = self._infer_concat_args(applied_example) @@ -580,8 +595,66 @@ def _combine(self, applied, shortcut=False): combined = self._maybe_unstack(combined) return combined - def reduce(self, func, dim=None, axis=None, - keep_attrs=None, shortcut=True, **kwargs): + def quantile(self, q, dim=None, interpolation='linear', keep_attrs=None): + """Compute the qth quantile over each array in the groups and + concatenate them together into a new array. + + Parameters + ---------- + q : float in range of [0,1] (or sequence of floats) + Quantile to compute, which must be between 0 and 1 + inclusive. + dim : str or sequence of str, optional + Dimension(s) over which to apply quantile. + Defaults to the grouped dimension. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is + the fractional part of the index surrounded by ``i`` and + ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + + Returns + ------- + quantiles : Variable + If `q` is a single quantile, then the result + is a scalar. If multiple percentiles are given, first axis of + the result corresponds to the quantile and a quantile dimension + is added to the return array. The other dimensions are the + dimensions that remain after the reduction of the array. + + See Also + -------- + numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, + DataArray.quantile + """ + if dim == DEFAULT_DIMS: + dim = ALL_DIMS + # TODO change this to dim = self._group_dim after + # the deprecation process + if self._obj.ndim > 1: + warnings.warn( + "Default reduction dimension will be changed to the " + "grouped dimension in a future version of xarray. 
To " + "silence this warning, pass dim=xarray.ALL_DIMS " + "explicitly.", + FutureWarning, stacklevel=2) + + out = self.apply(self._obj.__class__.quantile, shortcut=False, + q=q, dim=dim, interpolation=interpolation, + keep_attrs=keep_attrs) + + if np.asarray(q, dtype=np.float64).ndim == 0: + out = out.drop('quantile') + return out + + def reduce(self, func, dim=None, axis=None, keep_attrs=None, + shortcut=True, **kwargs): """Reduce the items in this group by applying `func` along some dimension(s). diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 1effb9347dd..1ba3175dc2f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -453,7 +453,13 @@ def __array__(self, dtype=None): def __getitem__(self, key): key = expanded_indexer(key, self.ndim) - return self.array[self.indexer_cls(key)] + result = self.array[self.indexer_cls(key)] + if isinstance(result, ExplicitlyIndexed): + return type(self)(result, self.indexer_cls) + else: + # Sometimes explicitly indexed arrays return NumPy arrays or + # scalars. + return result class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 421ac39ebd8..94a5d4af79a 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -473,7 +473,7 @@ def merge_core(objs, 'coordinates or not in the merged result: %s' % ambiguous_coords) - return variables, coord_names, dict(dims) + return variables, coord_names, dims def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA): @@ -533,9 +533,14 @@ def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA): from .dataarray import DataArray from .dataset import Dataset - dict_like_objects = [ - obj.to_dataset() if isinstance(obj, DataArray) else obj - for obj in objects] + dict_like_objects = list() + for obj in objects: + if not (isinstance(obj, (DataArray, Dataset, dict))): + raise TypeError("objects must be an iterable containing only " + "Dataset(s), DataArray(s), and dictionaries.") + + obj = obj.to_dataset() if isinstance(obj, DataArray) else obj + dict_like_objects.append(obj) variables, coord_names, dims = merge_core(dict_like_objects, compat, join, fill_value=fill_value) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 3931512325e..6009983beb2 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -9,7 +9,7 @@ from . 
import utils from .common import _contains_datetime_like_objects from .computation import apply_ufunc -from .duck_array_ops import dask_array_type, datetime_to_numeric +from .duck_array_ops import dask_array_type from .utils import OrderedSet, is_scalar from .variable import Variable, broadcast_variables diff --git a/xarray/core/options.py b/xarray/core/options.py index d441a81d325..532d86a8f38 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -59,7 +59,7 @@ def _warn_on_setting_enable_cftimeindex(enable_cftimeindex): def _get_keep_attrs(default): global_choice = OPTIONS['keep_attrs'] - if global_choice is 'default': + if global_choice == 'default': return default elif global_choice in [True, False]: return global_choice diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index f76634c65aa..7e2b0bbf6c4 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -57,15 +57,15 @@ def remove_unused_levels(self): -------- >>> i = pd.MultiIndex.from_product([range(2), list('ab')]) MultiIndex(levels=[[0, 1], ['a', 'b']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) >>> i[2:] MultiIndex(levels=[[0, 1], ['a', 'b']], - labels=[[1, 1], [0, 1]]) + codes=[[1, 1], [0, 1]]) The 0 from the first level is not represented and can be removed >>> i[2:].remove_unused_levels() MultiIndex(levels=[[1], ['a', 'b']], - labels=[[0, 0], [0, 1]]) + codes=[[0, 0], [0, 1]]) """ import pandas.core.algorithms as algos diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index e7f41be8667..cac78aabe98 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -36,14 +36,16 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -from ..coding.cftimeindex import CFTimeIndex -from ..coding.cftime_offsets import (cftime_range, normalize_date, - Day, MonthEnd, QuarterEnd, YearEnd, - CFTIME_TICKS, to_offset) import datetime + import numpy as np import pandas as pd +from ..coding.cftime_offsets import ( + CFTIME_TICKS, Day, MonthEnd, QuarterEnd, YearEnd, cftime_range, + normalize_date, to_offset) +from ..coding.cftimeindex import CFTimeIndex + class CFTimeGrouper: """This is a simple container for the grouping parameters that implements a diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index ad9b17fef92..4773512cdc4 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -7,8 +7,8 @@ from . 
import dtypes, duck_array_ops, utils
 from .dask_array_ops import dask_rolling_wrapper
 from .ops import (
-    bn, has_bottleneck, inject_coarsen_methods,
-    inject_bottleneck_rolling_methods, inject_datasetrolling_methods)
+    bn, has_bottleneck, inject_bottleneck_rolling_methods,
+    inject_coarsen_methods, inject_datasetrolling_methods)
 from .pycompat import dask_array_type
 
@@ -170,15 +170,15 @@ def construct(self, window_dim, stride=1, fill_value=dtypes.NA):
         --------
         >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b'))
         >>>
-        >>> rolling = da.rolling(a=3)
-        >>> rolling.to_datarray('window_dim')
+        >>> rolling = da.rolling(b=3)
+        >>> rolling.construct('window_dim')
         <xarray.DataArray (a: 2, b: 4, window_dim: 3)>
         array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]],
                [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]])
         Dimensions without coordinates: a, b, window_dim
         >>>
-        >>> rolling = da.rolling(a=3, center=True)
-        >>> rolling.to_datarray('window_dim')
+        >>> rolling = da.rolling(b=3, center=True)
+        >>> rolling.construct('window_dim')
         <xarray.DataArray (a: 2, b: 4, window_dim: 3)>
         array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]],
                [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]])
 
@@ -211,6 +211,29 @@ def reduce(self, func, **kwargs):
         -------
         reduced : DataArray
             Array with summarized data.
+
+        Examples
+        --------
+        >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b'))
+        >>>
+        >>> rolling = da.rolling(b=3)
+        >>> rolling.construct('window_dim')
+        <xarray.DataArray (a: 2, b: 4, window_dim: 3)>
+        array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]],
+               [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]])
+        Dimensions without coordinates: a, b, window_dim
+        >>>
+        >>> rolling.reduce(np.sum)
+        <xarray.DataArray (a: 2, b: 4)>
+        array([[nan, nan,  3.,  6.],
+               [nan, nan, 15., 18.]])
+        Dimensions without coordinates: a, b
+        >>>
+        >>> rolling = da.rolling(b=3, min_periods=1)
+        >>> rolling.reduce(np.nansum)
+        <xarray.DataArray (a: 2, b: 4)>
+        array([[ 0.,  1.,  3.,  6.],
+               [ 4.,  9., 15., 18.]])
         """
         rolling_dim = utils.get_temp_dimname(self.obj.dims, '_rolling_dim')
         windows = self.construct(rolling_dim)
diff --git a/xarray/core/rolling_exp.py b/xarray/core/rolling_exp.py
new file mode 100644
index 00000000000..ff6baef5c3a
--- /dev/null
+++ b/xarray/core/rolling_exp.py
@@ -0,0 +1,106 @@
+import numpy as np
+
+from .pycompat import dask_array_type
+
+
+def _get_alpha(com=None, span=None, halflife=None, alpha=None):
+    # pandas defines in terms of com (converting to alpha in the algo)
+    # so use its function to get a com and then convert to alpha
+
+    com = _get_center_of_mass(com, span, halflife, alpha)
+    return 1 / (1 + com)
+
+
+def move_exp_nanmean(array, *, axis, alpha):
+    if isinstance(array, dask_array_type):
+        raise TypeError(
+            "rolling_exp is not currently supported for dask arrays")
+    import numbagg
+    if axis == ():
+        return array.astype(np.float64)
+    else:
+        return numbagg.move_exp_nanmean(
+            array, axis=axis, alpha=alpha)
+
+
+def _get_center_of_mass(comass, span, halflife, alpha):
+    """
+    Vendored from pandas.core.window._get_center_of_mass
+
+    See licenses/PANDAS_LICENSE for the function's license
+    """
+    from pandas.core import common as com
+    valid_count = com.count_not_none(comass, span, halflife, alpha)
+    if valid_count > 1:
+        raise ValueError("comass, span, halflife, and alpha "
+                         "are mutually exclusive")
+
+    # Convert to center of mass; domain checks ensure 0 < alpha <= 1
+    if comass is not None:
+        if comass < 0:
+            raise ValueError("comass must satisfy: comass >= 0")
+    elif span is not None:
+        if span < 1:
+            raise ValueError("span must satisfy: span >= 1")
+        comass = (span - 1) / 2.
+    elif halflife is not None:
+        if halflife <= 0:
+            raise ValueError("halflife must satisfy: halflife > 0")
+        decay = 1 - np.exp(np.log(0.5) / halflife)
+        comass = 1 / decay - 1
+    elif alpha is not None:
+        if alpha <= 0 or alpha > 1:
+            raise ValueError("alpha must satisfy: 0 < alpha <= 1")
+        comass = (1.0 - alpha) / alpha
+    else:
+        raise ValueError("Must pass one of comass, span, halflife, or alpha")
+
+    return float(comass)
+
+
+class RollingExp:
+    """
+    Exponentially-weighted moving window object.
+    Similar to EWM in pandas.
+
+    Parameters
+    ----------
+    obj : Dataset or DataArray
+        Object to window.
+    windows : mapping from a single dimension name to a window value
+        The dimension name (e.g. `time`) is the dimension to create the
+        rolling exponential window along, and the value is the size of the
+        moving window, interpreted according to `window_type`.
+    window_type : str, one of ['span', 'com', 'halflife', 'alpha'], default 'span'
+        The format of the previously supplied window. Each is a simple
+        numerical transformation of the others. Described in detail:
+        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.ewm.html
+
+    Returns
+    -------
+    RollingExp : type of input argument
+    """  # noqa
+
+    def __init__(self, obj, windows, window_type='span'):
+        self.obj = obj
+        dim, window = next(iter(windows.items()))
+        self.dim = dim
+        self.alpha = _get_alpha(**{window_type: window})
+
+    def mean(self):
+        """
+        Exponentially weighted moving average.
+
+        Examples
+        --------
+        >>> da = xr.DataArray([1, 1, 2, 2, 2], dims='x')
+        >>> da.rolling_exp(x=2, window_type='span').mean()
+        <xarray.DataArray (x: 5)>
+        array([1.      , 1.      , 1.692308, 1.9     , 1.966942])
+        Dimensions without coordinates: x
+        """
+
+        return self.obj.reduce(
+            move_exp_nanmean, dim=self.dim, alpha=self.alpha)
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
index 94787dd35e2..9b762ab99c7 100644
--- a/xarray/core/utils.py
+++ b/xarray/core/utils.py
@@ -7,16 +7,16 @@
 import re
 import warnings
 from collections import OrderedDict
-from typing import (AbstractSet, Any, Callable, Container, Dict, Hashable,
-                    Iterable, Iterator, Optional, Sequence,
-                    Tuple, TypeVar, cast)
+from typing import (
+    AbstractSet, Any, Callable, Container, Dict, Hashable, Iterable, Iterator,
+    Mapping, MutableMapping, MutableSet, Optional, Sequence, Tuple, TypeVar,
+    cast)
 
 import numpy as np
 import pandas as pd
 
 from .pycompat import dask_array_type
 
-from typing import Mapping, MutableMapping, MutableSet
 try:  # Fix typed collections in Python 3.5.0~3.5.2
     from .pycompat import Mapping, MutableMapping, MutableSet  # noqa: F811
 except ImportError:
@@ -62,7 +62,7 @@ def wrapper(*args, **kwargs):
 def _maybe_cast_to_cftimeindex(index: pd.Index) -> pd.Index:
     from ..coding.cftimeindex import CFTimeIndex
 
-    if index.dtype == 'O':
+    if len(index) > 0 and index.dtype == 'O':
         try:
             return CFTimeIndex(index)
         except (ImportError, TypeError):
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 41f8795b595..cccb9663ad5 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -3,6 +3,7 @@
 import typing
 from collections import OrderedDict, defaultdict
 from datetime import timedelta
+from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
@@ -870,6 +871,7 @@ def chunk(self, chunks=None, name=None, lock=False):
         -------
         chunked : xarray.Variable
         """
+        import dask
         import dask.array as da
 
         if utils.is_dict_like(chunks):
@@ -892,7 +894,17 @@ def chunk(self, chunks=None, name=None, lock=False):
             #
https://github.com/dask/dask/issues/2883 data = indexing.ImplicitToExplicitIndexingAdapter( data, indexing.OuterIndexer) - data = da.from_array(data, chunks, name=name, lock=lock) + + # For now, assume that all arrays that we wrap with dask (including + # our lazily loaded backend array classes) should use NumPy array + # operations. + if LooseVersion(dask.__version__) > '1.2.2': + kwargs = dict(meta=np.ndarray) + else: + kwargs = dict() + + data = da.from_array( + data, chunks, name=name, lock=lock, **kwargs) return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) @@ -1334,7 +1346,7 @@ def where(self, cond, other=dtypes.NA): return ops.where_method(self, cond, other) def reduce(self, func, dim=None, axis=None, - keep_attrs=None, allow_lazy=False, **kwargs): + keep_attrs=None, keepdims=False, allow_lazy=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1354,6 +1366,9 @@ def reduce(self, func, dim=None, axis=None, If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + keepdims : bool, default False + If True, the dimensions which are reduced are left in the result + as dimensions of size one **kwargs : dict Additional keyword arguments passed on to `func`. @@ -1381,8 +1396,19 @@ def reduce(self, func, dim=None, axis=None, else: removed_axes = (range(self.ndim) if axis is None else np.atleast_1d(axis) % self.ndim) - dims = [adim for n, adim in enumerate(self.dims) - if n not in removed_axes] + if keepdims: + # Insert np.newaxis for removed dims + slices = tuple(np.newaxis if i in removed_axes else + slice(None, None) for i in range(self.ndim)) + if getattr(data, 'shape', None) is None: + # Reduce has produced a scalar value, not an array-like + data = np.asanyarray(data)[slices] + else: + data = data[slices] + dims = self.dims + else: + dims = [adim for n, adim in enumerate(self.dims) + if n not in removed_axes] if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) diff --git a/xarray/plot/__init__.py b/xarray/plot/__init__.py index 51712e78bf8..adda541c21d 100644 --- a/xarray/plot/__init__.py +++ b/xarray/plot/__init__.py @@ -1,7 +1,5 @@ -from .plot import (plot, line, step, contourf, contour, - hist, imshow, pcolormesh) - from .facetgrid import FacetGrid +from .plot import contour, contourf, hist, imshow, line, pcolormesh, plot, step __all__ = [ 'plot', diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index 4a8d77d7b86..9d2b4848319 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -1,14 +1,13 @@ import functools import itertools import warnings -from inspect import getfullargspec import numpy as np from ..core.formatting import format_item from .utils import ( - _infer_xy_labels, _process_cmap_cbar_kwargs, - import_matplotlib_pyplot, label_from_attrs) + _infer_xy_labels, _process_cmap_cbar_kwargs, import_matplotlib_pyplot, + label_from_attrs) # Overrides axes.labelsize, xtick.major.size, ytick.major.size # from mpl.rcParams @@ -483,7 +482,7 @@ def map(self, func, *args, **kwargs): # TODO: better way to verify that an artist is mappable? 
# https://stackoverflow.com/questions/33023036/is-it-possible-to-detect-if-a-matplotlib-artist-is-a-mappable-suitable-for-use-w#33023522 if (maybe_mappable and - hasattr(maybe_mappable, 'autoscale_None')): + hasattr(maybe_mappable, 'autoscale_None')): self._mappables.append(maybe_mappable) self._finalize_grid(*args[:2]) diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index 316d4fb4dd9..d4cb1a7726b 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -64,8 +64,10 @@ def _infer_line_data(darray, x, y, hue): if huename in darray.dims: otherindex = 1 if darray.dims.index(huename) == 0 else 0 otherdim = darray.dims[otherindex] - yplt = darray.transpose(otherdim, huename) - xplt = xplt.transpose(otherdim, huename) + yplt = darray.transpose( + otherdim, huename, transpose_coords=False) + xplt = xplt.transpose( + otherdim, huename, transpose_coords=False) else: raise ValueError('For 2D inputs, hue must be a dimension' + ' i.e. one of ' + repr(darray.dims)) @@ -79,7 +81,9 @@ def _infer_line_data(darray, x, y, hue): if yplt.ndim > 1: if huename in darray.dims: otherindex = 1 if darray.dims.index(huename) == 0 else 0 - xplt = darray.transpose(otherdim, huename) + otherdim = darray.dims[otherindex] + xplt = darray.transpose( + otherdim, huename, transpose_coords=False) else: raise ValueError('For 2D inputs, hue must be a dimension' + ' i.e. one of ' + repr(darray.dims)) @@ -614,9 +618,9 @@ def newplotfunc(darray, x=None, y=None, figsize=None, size=None, yx_dims = (ylab, xlab) dims = yx_dims + tuple(d for d in darray.dims if d not in yx_dims) if dims != darray.dims: - darray = darray.transpose(*dims) + darray = darray.transpose(*dims, transpose_coords=True) elif darray[xlab].dims[-1] == darray.dims[0]: - darray = darray.transpose() + darray = darray.transpose(transpose_coords=True) # Pass the data as a masked ndarray too zval = darray.to_masked_array(copy=False) diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 0a507993cd6..c9f72b177c6 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -2,15 +2,14 @@ import textwrap import warnings from datetime import datetime +from distutils.version import LooseVersion +from inspect import getfullargspec import numpy as np import pandas as pd -from inspect import getfullargspec - from ..core.options import OPTIONS from ..core.utils import is_scalar -from distutils.version import LooseVersion try: import nc_time_axis @@ -265,8 +264,7 @@ def _determine_cmap_params(plot_data, vmin=None, vmax=None, cmap=None, if extend is None: extend = _determine_extend(calc_data, vmin, vmax) - if ((levels is not None or isinstance(norm, mpl.colors.BoundaryNorm)) - and (not isinstance(cmap, mpl.colors.Colormap))): + if levels is not None or isinstance(norm, mpl.colors.BoundaryNorm): cmap, newnorm = _build_discrete_cmap(cmap, levels, extend, filled) norm = newnorm if norm is None else norm diff --git a/xarray/testing.py b/xarray/testing.py index eb8a0e8603d..42c91b1eda2 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -1,11 +1,15 @@ """Testing functions exposed to the user API""" from collections import OrderedDict +from typing import Hashable, Union import numpy as np import pandas as pd from xarray.core import duck_array_ops from xarray.core import formatting +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset +from xarray.core.variable import IndexVariable, Variable from xarray.core.indexes import default_indexes @@ -49,12 +53,11 @@ def assert_equal(a, b): assert_identical, assert_allclose, 
Dataset.equals, DataArray.equals, numpy.testing.assert_array_equal """ - import xarray as xr __tracebackhide__ = True # noqa: F841 assert type(a) == type(b) # noqa - if isinstance(a, (xr.Variable, xr.DataArray)): + if isinstance(a, (Variable, DataArray)): assert a.equals(b), formatting.diff_array_repr(a, b, 'equals') - elif isinstance(a, xr.Dataset): + elif isinstance(a, Dataset): assert a.equals(b), formatting.diff_dataset_repr(a, b, 'equals') else: raise TypeError('{} not supported by assertion comparison' @@ -78,15 +81,14 @@ def assert_identical(a, b): -------- assert_equal, assert_allclose, Dataset.equals, DataArray.equals """ - import xarray as xr __tracebackhide__ = True # noqa: F841 assert type(a) == type(b) # noqa - if isinstance(a, xr.Variable): + if isinstance(a, Variable): assert a.identical(b), formatting.diff_array_repr(a, b, 'identical') - elif isinstance(a, xr.DataArray): + elif isinstance(a, DataArray): assert a.name == b.name assert a.identical(b), formatting.diff_array_repr(a, b, 'identical') - elif isinstance(a, (xr.Dataset, xr.Variable)): + elif isinstance(a, (Dataset, Variable)): assert a.identical(b), formatting.diff_dataset_repr(a, b, 'identical') else: raise TypeError('{} not supported by assertion comparison' @@ -118,15 +120,14 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): -------- assert_identical, assert_equal, numpy.testing.assert_allclose """ - import xarray as xr __tracebackhide__ = True # noqa: F841 assert type(a) == type(b) # noqa kwargs = dict(rtol=rtol, atol=atol, decode_bytes=decode_bytes) - if isinstance(a, xr.Variable): + if isinstance(a, Variable): assert a.dims == b.dims allclose = _data_allclose_or_equiv(a.values, b.values, **kwargs) assert allclose, '{}\n{}'.format(a.values, b.values) - elif isinstance(a, xr.DataArray): + elif isinstance(a, DataArray): assert_allclose(a.variable, b.variable, **kwargs) assert set(a.coords) == set(b.coords) for v in a.coords.variables: @@ -136,7 +137,7 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): b.coords[v].values, **kwargs) assert allclose, '{}\n{}'.format(a.coords[v].values, b.coords[v].values) - elif isinstance(a, xr.Dataset): + elif isinstance(a, Dataset): assert set(a.data_vars) == set(b.data_vars) assert set(a.coords) == set(b.coords) for k in list(a.variables) + list(a.coords): @@ -148,14 +149,12 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): def _assert_indexes_invariants_checks(indexes, possible_coord_variables, dims): - import xarray as xr - assert isinstance(indexes, OrderedDict), indexes assert all(isinstance(v, pd.Index) for v in indexes.values()), \ {k: type(v) for k, v in indexes.items()} index_vars = {k for k, v in possible_coord_variables.items() - if isinstance(v, xr.IndexVariable)} + if isinstance(v, IndexVariable)} assert indexes.keys() <= index_vars, (set(indexes), index_vars) # Note: when we support non-default indexes, these checks should be opt-in @@ -167,17 +166,97 @@ def _assert_indexes_invariants_checks(indexes, possible_coord_variables, dims): (indexes, defaults) -def _assert_indexes_invariants(a): - """Separate helper function for checking indexes invariants only.""" - import xarray as xr - - if isinstance(a, xr.DataArray): - if a._indexes is not None: - _assert_indexes_invariants_checks(a._indexes, a._coords, a.dims) - elif isinstance(a, xr.Dataset): - if a._indexes is not None: - _assert_indexes_invariants_checks( - a._indexes, a._variables, a._dims) - elif isinstance(a, xr.Variable): - # no indexes - pass 
+def _assert_variable_invariants(var: Variable, name: Hashable = None): + if name is None: + name_or_empty = () # type: tuple + else: + name_or_empty = (name,) + assert isinstance(var._dims, tuple), name_or_empty + (var._dims,) + assert len(var._dims) == len(var._data.shape), \ + name_or_empty + (var._dims, var._data.shape) + assert isinstance(var._encoding, (type(None), dict)), \ + name_or_empty + (var._encoding,) + assert isinstance(var._attrs, (type(None), OrderedDict)), \ + name_or_empty + (var._attrs,) + + +def _assert_dataarray_invariants(da: DataArray): + assert isinstance(da._variable, Variable), da._variable + _assert_variable_invariants(da._variable) + + assert isinstance(da._coords, OrderedDict), da._coords + assert all( + isinstance(v, Variable) for v in da._coords.values()), da._coords + assert all(set(v.dims) <= set(da.dims) for v in da._coords.values()), \ + (da.dims, {k: v.dims for k, v in da._coords.items()}) + assert all(isinstance(v, IndexVariable) + for (k, v) in da._coords.items() + if v.dims == (k,)), \ + {k: type(v) for k, v in da._coords.items()} + for k, v in da._coords.items(): + _assert_variable_invariants(v, k) + + if da._indexes is not None: + _assert_indexes_invariants_checks(da._indexes, da._coords, da.dims) + + assert da._initialized is True + + +def _assert_dataset_invariants(ds: Dataset): + assert isinstance(ds._variables, OrderedDict), type(ds._variables) + assert all( + isinstance(v, Variable) for v in ds._variables.values()), \ + ds._variables + for k, v in ds._variables.items(): + _assert_variable_invariants(v, k) + + assert isinstance(ds._coord_names, set), ds._coord_names + assert ds._coord_names <= ds._variables.keys(), \ + (ds._coord_names, set(ds._variables)) + + assert type(ds._dims) is dict, ds._dims + assert all(isinstance(v, int) for v in ds._dims.values()), ds._dims + var_dims = set() # type: set + for v in ds._variables.values(): + var_dims.update(v.dims) + assert ds._dims.keys() == var_dims, (set(ds._dims), var_dims) + assert all(ds._dims[k] == v.sizes[k] + for v in ds._variables.values() + for k in v.sizes), \ + (ds._dims, {k: v.sizes for k, v in ds._variables.items()}) + assert all(isinstance(v, IndexVariable) + for (k, v) in ds._variables.items() + if v.dims == (k,)), \ + {k: type(v) for k, v in ds._variables.items() if v.dims == (k,)} + assert all(v.dims == (k,) + for (k, v) in ds._variables.items() + if k in ds._dims), \ + {k: v.dims for k, v in ds._variables.items() if k in ds._dims} + + if ds._indexes is not None: + _assert_indexes_invariants_checks(ds._indexes, ds._variables, ds._dims) + + assert isinstance(ds._encoding, (type(None), dict)) + assert isinstance(ds._attrs, (type(None), OrderedDict)) + assert ds._initialized is True + + +def _assert_internal_invariants( + xarray_obj: Union[DataArray, Dataset, Variable], +): + """Validate that an xarray object satisfies its own internal invariants. + + This exists for the benefit of xarray's own test suite, but may be useful + in external projects if they (ill-advisedly) create objects using xarray's + private APIs. 
+ """ + if isinstance(xarray_obj, Variable): + _assert_variable_invariants(xarray_obj) + elif isinstance(xarray_obj, DataArray): + _assert_dataarray_invariants(xarray_obj) + elif isinstance(xarray_obj, Dataset): + _assert_dataset_invariants(xarray_obj) + else: + raise TypeError( + '{} is not a supported type for xarray invariant checks' + .format(type(xarray_obj))) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index e9d670e4dd9..81bb1a1e18d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -1,19 +1,19 @@ +import importlib +import re import warnings from contextlib import contextmanager from distutils import version -import re -import importlib -from unittest import mock +from unittest import mock # noqa import numpy as np -from numpy.testing import assert_array_equal # noqa: F401 -from xarray.core.duck_array_ops import allclose_or_equiv # noqa import pytest +from numpy.testing import assert_array_equal # noqa: F401 +import xarray.testing from xarray.core import utils -from xarray.core.options import set_options +from xarray.core.duck_array_ops import allclose_or_equiv # noqa from xarray.core.indexing import ExplicitlyIndexed -import xarray.testing +from xarray.core.options import set_options from xarray.plot.utils import import_seaborn try: @@ -74,6 +74,7 @@ def LooseVersion(vstring): has_np113, requires_np113 = _importorskip('numpy', minversion='1.13.0') has_iris, requires_iris = _importorskip('iris') has_cfgrib, requires_cfgrib = _importorskip('cfgrib') +has_numbagg, requires_numbagg = _importorskip('numbagg') # some special cases has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf', @@ -108,23 +109,8 @@ def LooseVersion(vstring): else: dask.config.set(scheduler='single-threaded') -# pytest config -try: - _SKIP_FLAKY = not pytest.config.getoption("--run-flaky") - _SKIP_NETWORK_TESTS = not pytest.config.getoption("--run-network-tests") -except (ValueError, AttributeError): - # Can't get config from pytest, e.g., because xarray is installed instead - # of being run from a development version (and hence conftests.py is not - # available). Don't run flaky tests. - _SKIP_FLAKY = True - _SKIP_NETWORK_TESTS = True - -flaky = pytest.mark.skipif( - _SKIP_FLAKY, reason="set --run-flaky option to run flaky tests") -network = pytest.mark.skipif( - _SKIP_NETWORK_TESTS, - reason="set --run-network-tests option to run tests requiring an " - "internet connection") +flaky = pytest.mark.flaky +network = pytest.mark.network @contextmanager @@ -183,21 +169,20 @@ def source_ndarray(array): # Internal versions of xarray's test functions that validate additional # invariants -# TODO: add more invariant checks. 
def assert_equal(a, b): xarray.testing.assert_equal(a, b) - xarray.testing._assert_indexes_invariants(a) - xarray.testing._assert_indexes_invariants(b) + xarray.testing._assert_internal_invariants(a) + xarray.testing._assert_internal_invariants(b) def assert_identical(a, b): xarray.testing.assert_identical(a, b) - xarray.testing._assert_indexes_invariants(a) - xarray.testing._assert_indexes_invariants(b) + xarray.testing._assert_internal_invariants(a) + xarray.testing._assert_internal_invariants(b) def assert_allclose(a, b, **kwargs): xarray.testing.assert_allclose(a, b, **kwargs) - xarray.testing._assert_indexes_invariants(a) - xarray.testing._assert_indexes_invariants(b) + xarray.testing._assert_internal_invariants(a) + xarray.testing._assert_internal_invariants(b) diff --git a/xarray/tests/test_accessors.py b/xarray/tests/test_accessor_dt.py similarity index 90% rename from xarray/tests/test_accessors.py rename to xarray/tests/test_accessor_dt.py index 6bda5772143..09041a6a69f 100644 --- a/xarray/tests/test_accessors.py +++ b/xarray/tests/test_accessor_dt.py @@ -42,6 +42,10 @@ def test_field_access(self): assert_equal(days, self.data.time.dt.day) assert_equal(hours, self.data.time.dt.hour) + def test_strftime(self): + assert ('2000-01-01 01:00:00' == self.data.time.dt.strftime( + '%Y-%m-%d %H:%M:%S')[1]) + def test_not_datetime_type(self): nontime_data = self.data.copy() int_data = np.arange(len(self.data.time)).astype('int8') @@ -60,6 +64,7 @@ def test_dask_field_access(self): floor = self.times_data.dt.floor('D') ceil = self.times_data.dt.ceil('D') round = self.times_data.dt.round('D') + strftime = self.times_data.dt.strftime('%Y-%m-%d %H:%M:%S') dask_times_arr = da.from_array(self.times_arr, chunks=(5, 5, 50)) dask_times_2d = xr.DataArray(dask_times_arr, @@ -73,12 +78,14 @@ def test_dask_field_access(self): dask_floor = dask_times_2d.dt.floor('D') dask_ceil = dask_times_2d.dt.ceil('D') dask_round = dask_times_2d.dt.round('D') + dask_strftime = dask_times_2d.dt.strftime('%Y-%m-%d %H:%M:%S') # Test that the data isn't eagerly evaluated assert isinstance(dask_year.data, da.Array) assert isinstance(dask_month.data, da.Array) assert isinstance(dask_day.data, da.Array) assert isinstance(dask_hour.data, da.Array) + assert isinstance(dask_strftime.data, da.Array) # Double check that outcome chunksize is unchanged dask_chunks = dask_times_2d.chunks @@ -86,6 +93,7 @@ def test_dask_field_access(self): assert dask_month.data.chunks == dask_chunks assert dask_day.data.chunks == dask_chunks assert dask_hour.data.chunks == dask_chunks + assert dask_strftime.data.chunks == dask_chunks # Check the actual output from the accessors assert_equal(years, dask_year.compute()) @@ -95,6 +103,7 @@ def test_dask_field_access(self): assert_equal(floor, dask_floor.compute()) assert_equal(ceil, dask_ceil.compute()) assert_equal(round, dask_round.compute()) + assert_equal(strftime, dask_strftime.compute()) def test_seasons(self): dates = pd.date_range(start="2000/01/01", freq="M", periods=12) @@ -169,6 +178,21 @@ def test_field_access(data, field): assert_equal(result, expected) +@pytest.mark.skipif(not has_cftime, reason='cftime not installed') +def test_cftime_strftime_access(data): + """ compare cftime formatting against datetime formatting """ + date_format = '%Y%m%d%H' + result = data.time.dt.strftime(date_format) + datetime_array = xr.DataArray( + xr.coding.cftimeindex.CFTimeIndex( + data.time.values).to_datetimeindex(), + name="stftime", + coords=data.time.coords, + dims=data.time.dims) + expected = 
datetime_array.dt.strftime(date_format) + assert_equal(result, expected) + + @pytest.mark.skipif(not has_dask, reason='dask not installed') @pytest.mark.skipif(not has_cftime, reason='cftime not installed') @pytest.mark.parametrize('field', ['year', 'month', 'day', 'hour', diff --git a/xarray/tests/test_accessor_str.py b/xarray/tests/test_accessor_str.py new file mode 100644 index 00000000000..800096b806b --- /dev/null +++ b/xarray/tests/test_accessor_str.py @@ -0,0 +1,659 @@ +# Tests for the `str` accessor are derived from the original +# pandas string accessor tests. + +# For reference, here is a copy of the pandas copyright notice: + +# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +# All rights reserved. + +# Copyright (c) 2008-2011 AQR Capital Management, LLC +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the copyright holder nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import re + +import numpy as np +import pytest + +import xarray as xr + +from . 
import assert_equal, requires_dask + + +@pytest.fixture(params=[np.str_, np.bytes_]) +def dtype(request): + return request.param + + +@requires_dask +def test_dask(): + import dask.array as da + arr = da.from_array(['a', 'b', 'c']) + xarr = xr.DataArray(arr) + + result = xarr.str.len().compute() + expected = xr.DataArray([1, 1, 1]) + assert_equal(result, expected) + + +def test_count(dtype): + values = xr.DataArray(['foo', 'foofoo', 'foooofooofommmfoo']).astype(dtype) + result = values.str.count('f[o]+') + expected = xr.DataArray([1, 2, 4]) + assert_equal(result, expected) + + +def test_contains(dtype): + values = xr.DataArray(['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_']).astype(dtype) + # case insensitive using regex + result = values.str.contains('FOO|mmm', case=False) + expected = xr.DataArray([True, False, True, True]) + assert_equal(result, expected) + # case insensitive without regex + result = values.str.contains('foo', regex=False, case=False) + expected = xr.DataArray([True, False, True, False]) + assert_equal(result, expected) + + +def test_starts_ends_with(dtype): + values = xr.DataArray( + ['om', 'foo_nom', 'nom', 'bar_foo', 'foo']).astype(dtype) + result = values.str.startswith('foo') + expected = xr.DataArray([False, True, False, False, True]) + assert_equal(result, expected) + result = values.str.endswith('foo') + expected = xr.DataArray([False, False, False, True, True]) + assert_equal(result, expected) + + +def test_case(dtype): + da = xr.DataArray(['SOme word']).astype(dtype) + capitalized = xr.DataArray(['Some word']).astype(dtype) + lowered = xr.DataArray(['some word']).astype(dtype) + swapped = xr.DataArray(['soME WORD']).astype(dtype) + titled = xr.DataArray(['Some Word']).astype(dtype) + uppered = xr.DataArray(['SOME WORD']).astype(dtype) + assert_equal(da.str.capitalize(), capitalized) + assert_equal(da.str.lower(), lowered) + assert_equal(da.str.swapcase(), swapped) + assert_equal(da.str.title(), titled) + assert_equal(da.str.upper(), uppered) + + +def test_replace(dtype): + values = xr.DataArray(['fooBAD__barBAD']).astype(dtype) + result = values.str.replace('BAD[_]*', '') + expected = xr.DataArray(['foobar']).astype(dtype) + assert_equal(result, expected) + + result = values.str.replace('BAD[_]*', '', n=1) + expected = xr.DataArray(['foobarBAD']).astype(dtype) + assert_equal(result, expected) + + s = xr.DataArray(['A', 'B', 'C', 'Aaba', 'Baca', '', + 'CABA', 'dog', 'cat']).astype(dtype) + result = s.str.replace('A', 'YYY') + expected = xr.DataArray(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', 'CYYYBYYY', + 'dog', 'cat']).astype(dtype) + assert_equal(result, expected) + + result = s.str.replace('A', 'YYY', case=False) + expected = xr.DataArray(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', + '', 'CYYYBYYY', 'dog', 'cYYYt']).astype(dtype) + assert_equal(result, expected) + + result = s.str.replace('^.a|dog', 'XX-XX ', case=False) + expected = xr.DataArray(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', + 'XX-XX BA', 'XX-XX ', 'XX-XX t']).astype(dtype) + assert_equal(result, expected) + + +def test_replace_callable(): + values = xr.DataArray(['fooBAD__barBAD']) + # test with callable + repl = lambda m: m.group(0).swapcase() # noqa + result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) + exp = xr.DataArray(['foObaD__baRbaD']) + assert_equal(result, exp) + # test regex named groups + values = xr.DataArray(['Foo Bar Baz']) + pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" + repl = lambda m: m.group('middle').swapcase() # noqa + result = values.str.replace(pat, repl) + exp = xr.DataArray(['bAR']) +
assert_equal(result, exp) + + +def test_replace_unicode(): + # flags + unicode + values = xr.DataArray([b"abcd,\xc3\xa0".decode("utf-8")]) + expected = xr.DataArray([b"abcd, \xc3\xa0".decode("utf-8")]) + pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) + result = values.str.replace(pat, ", ") + assert_equal(result, expected) + + +def test_replace_compiled_regex(dtype): + values = xr.DataArray(['fooBAD__barBAD']).astype(dtype) + # test with compiled regex + pat = re.compile(dtype('BAD[_]*')) + result = values.str.replace(pat, '') + expected = xr.DataArray(['foobar']).astype(dtype) + assert_equal(result, expected) + + result = values.str.replace(pat, '', n=1) + expected = xr.DataArray(['foobarBAD']).astype(dtype) + assert_equal(result, expected) + + # case and flags provided to str.replace will have no effect + # and will produce warnings + values = xr.DataArray(['fooBAD__barBAD__bad']).astype(dtype) + pat = re.compile(dtype('BAD[_]*')) + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, '', flags=re.IGNORECASE) + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, '', case=False) + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, '', case=True) + + # test with callable + values = xr.DataArray(['fooBAD__barBAD']).astype(dtype) + repl = lambda m: m.group(0).swapcase() + pat = re.compile(dtype('[a-z][A-Z]{2}')) + result = values.str.replace(pat, repl, n=2) + expected = xr.DataArray(['foObaD__baRbaD']).astype(dtype) + assert_equal(result, expected) + + +def test_replace_literal(dtype): + # GH16808 literal replace (regex=False vs regex=True) + values = xr.DataArray(['f.o', 'foo']).astype(dtype) + expected = xr.DataArray(['bao', 'bao']).astype(dtype) + result = values.str.replace('f.', 'ba') + assert_equal(result, expected) + + expected = xr.DataArray(['bao', 'foo']).astype(dtype) + result = values.str.replace('f.', 'ba', regex=False) + assert_equal(result, expected) + + # Cannot do a literal replace if given a callable repl or compiled + # pattern + callable_repl = lambda m: m.group(0).swapcase() + compiled_pat = re.compile('[a-z][A-Z]{2}') + + msg = "Cannot use a callable replacement when regex=False" + with pytest.raises(ValueError, match=msg): + values.str.replace('abc', callable_repl, regex=False) + + msg = "Cannot use a compiled regex as replacement pattern with regex=False" + with pytest.raises(ValueError, match=msg): + values.str.replace(compiled_pat, '', regex=False) + + +def test_repeat(dtype): + values = xr.DataArray(['a', 'b', 'c', 'd']).astype(dtype) + result = values.str.repeat(3) + expected = xr.DataArray(['aaa', 'bbb', 'ccc', 'ddd']).astype(dtype) + assert_equal(result, expected) + + +def test_match(dtype): + # New match behavior introduced in 0.13 + values = xr.DataArray(['fooBAD__barBAD', 'foo']).astype(dtype) + result = values.str.match('.*(BAD[_]+).*(BAD)') + expected = xr.DataArray([True, False]) + assert_equal(result, expected) + + values = xr.DataArray(['fooBAD__barBAD', 'foo']).astype(dtype) + result = values.str.match('.*BAD[_]+.*BAD') + expected = xr.DataArray([True, False]) + assert_equal(result, expected) + + +def test_empty_str_methods(): + empty = xr.DataArray(np.empty(shape=(0,), dtype='U')) + empty_str = empty + empty_int = xr.DataArray(np.empty(shape=(0,), dtype=int)) + empty_bool = xr.DataArray(np.empty(shape=(0,), dtype=bool)) + empty_bytes = xr.DataArray(np.empty(shape=(0,), dtype='S')) + + 
assert_equal(empty_str, empty.str.title()) + assert_equal(empty_int, empty.str.count('a')) + assert_equal(empty_bool, empty.str.contains('a')) + assert_equal(empty_bool, empty.str.startswith('a')) + assert_equal(empty_bool, empty.str.endswith('a')) + assert_equal(empty_str, empty.str.lower()) + assert_equal(empty_str, empty.str.upper()) + assert_equal(empty_str, empty.str.replace('a', 'b')) + assert_equal(empty_str, empty.str.repeat(3)) + assert_equal(empty_bool, empty.str.match('^a')) + assert_equal(empty_int, empty.str.len()) + assert_equal(empty_int, empty.str.find('a')) + assert_equal(empty_int, empty.str.rfind('a')) + assert_equal(empty_str, empty.str.pad(42)) + assert_equal(empty_str, empty.str.center(42)) + assert_equal(empty_str, empty.str.slice(stop=1)) + assert_equal(empty_str, empty.str.slice(step=1)) + assert_equal(empty_str, empty.str.strip()) + assert_equal(empty_str, empty.str.lstrip()) + assert_equal(empty_str, empty.str.rstrip()) + assert_equal(empty_str, empty.str.wrap(42)) + assert_equal(empty_str, empty.str.get(0)) + assert_equal(empty_str, empty_bytes.str.decode('ascii')) + assert_equal(empty_bytes, empty.str.encode('ascii')) + assert_equal(empty_str, empty.str.isalnum()) + assert_equal(empty_str, empty.str.isalpha()) + assert_equal(empty_str, empty.str.isdigit()) + assert_equal(empty_str, empty.str.isspace()) + assert_equal(empty_str, empty.str.islower()) + assert_equal(empty_str, empty.str.isupper()) + assert_equal(empty_str, empty.str.istitle()) + assert_equal(empty_str, empty.str.isnumeric()) + assert_equal(empty_str, empty.str.isdecimal()) + assert_equal(empty_str, empty.str.capitalize()) + assert_equal(empty_str, empty.str.swapcase()) + table = str.maketrans('a', 'b') + assert_equal(empty_str, empty.str.translate(table)) + + +def test_ismethods(dtype): + values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] + str_s = xr.DataArray(values).astype(dtype) + alnum_e = [True, True, True, True, True, False, True, True, False, False] + alpha_e = [True, True, True, False, False, False, True, False, False, + False] + digit_e = [False, False, False, True, False, False, False, True, False, + False] + space_e = [False, False, False, False, False, False, False, False, + False, True] + lower_e = [False, True, False, False, False, False, False, False, + False, False] + upper_e = [True, False, False, False, True, False, True, False, False, + False] + title_e = [True, False, True, False, True, False, False, False, False, + False] + + assert_equal(str_s.str.isalnum(), xr.DataArray(alnum_e)) + assert_equal(str_s.str.isalpha(), xr.DataArray(alpha_e)) + assert_equal(str_s.str.isdigit(), xr.DataArray(digit_e)) + assert_equal(str_s.str.isspace(), xr.DataArray(space_e)) + assert_equal(str_s.str.islower(), xr.DataArray(lower_e)) + assert_equal(str_s.str.isupper(), xr.DataArray(upper_e)) + assert_equal(str_s.str.istitle(), xr.DataArray(title_e)) + + +def test_isnumeric(): + # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER + # 0x2605: ★ not number + # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY + # 0xFF13: ３ Em 3 + values = ['A', '3', '¼', '★', '፸', '３', 'four'] + s = xr.DataArray(values) + numeric_e = [False, True, True, False, True, True, False] + decimal_e = [False, True, False, False, False, True, False] + assert_equal(s.str.isnumeric(), xr.DataArray(numeric_e)) + assert_equal(s.str.isdecimal(), xr.DataArray(decimal_e)) + + +def test_len(dtype): + values = ['foo', 'fooo', 'fooooo', 'fooooooo'] + result = xr.DataArray(values).astype(dtype).str.len() + expected = xr.DataArray([len(x) for x in
values]) + assert_equal(result, expected) + + +def test_find(dtype): + values = xr.DataArray(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXX']) + values = values.astype(dtype) + result = values.str.find('EF') + assert_equal(result, xr.DataArray([4, 3, 1, 0, -1])) + expected = xr.DataArray([v.find(dtype('EF')) for v in values.values]) + assert_equal(result, expected) + + result = values.str.rfind('EF') + assert_equal(result, xr.DataArray([4, 5, 7, 4, -1])) + expected = xr.DataArray([v.rfind(dtype('EF')) for v in values.values]) + assert_equal(result, expected) + + result = values.str.find('EF', 3) + assert_equal(result, xr.DataArray([4, 3, 7, 4, -1])) + expected = xr.DataArray([v.find(dtype('EF'), 3) for v in values.values]) + assert_equal(result, expected) + + result = values.str.rfind('EF', 3) + assert_equal(result, xr.DataArray([4, 5, 7, 4, -1])) + expected = xr.DataArray([v.rfind(dtype('EF'), 3) for v in values.values]) + assert_equal(result, expected) + + result = values.str.find('EF', 3, 6) + assert_equal(result, xr.DataArray([4, 3, -1, 4, -1])) + expected = xr.DataArray([v.find(dtype('EF'), 3, 6) for v in values.values]) + assert_equal(result, expected) + + result = values.str.rfind('EF', 3, 6) + assert_equal(result, xr.DataArray([4, 3, -1, 4, -1])) + xp = xr.DataArray([v.rfind(dtype('EF'), 3, 6) for v in values.values]) + assert_equal(result, xp) + + +def test_index(dtype): + s = xr.DataArray(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', + 'EFGHEF']).astype(dtype) + + result = s.str.index('EF') + assert_equal(result, xr.DataArray([4, 3, 1, 0])) + + result = s.str.rindex('EF') + assert_equal(result, xr.DataArray([4, 5, 7, 4])) + + result = s.str.index('EF', 3) + assert_equal(result, xr.DataArray([4, 3, 7, 4])) + + result = s.str.rindex('EF', 3) + assert_equal(result, xr.DataArray([4, 5, 7, 4])) + + result = s.str.index('E', 4, 8) + assert_equal(result, xr.DataArray([4, 5, 7, 4])) + + result = s.str.rindex('E', 0, 5) + assert_equal(result, xr.DataArray([4, 3, 1, 4])) + + with pytest.raises(ValueError): + result = s.str.index('DE') + + +def test_pad(dtype): + values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype) + + result = values.str.pad(5, side='left') + expected = xr.DataArray([' a', ' b', ' c', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.pad(5, side='right') + expected = xr.DataArray(['a ', 'b ', 'c ', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.pad(5, side='both') + expected = xr.DataArray([' a ', ' b ', ' c ', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + +def test_pad_fillchar(dtype): + values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype) + + result = values.str.pad(5, side='left', fillchar='X') + expected = xr.DataArray(['XXXXa', 'XXXXb', 'XXXXc', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.pad(5, side='right', fillchar='X') + expected = xr.DataArray(['aXXXX', 'bXXXX', 'cXXXX', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.pad(5, side='both', fillchar='X') + expected = xr.DataArray(['XXaXX', 'XXbXX', 'XXcXX', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + msg = "fillchar must be a character, not str" + with pytest.raises(TypeError, match=msg): + result = values.str.pad(5, fillchar='XY') + + +def test_translate(): + values = xr.DataArray(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) + table = str.maketrans('abc', 'cde') + result = values.str.translate(table) + expected = xr.DataArray(['cdedefg', 
'cdee', 'edddfg', 'edefggg']) + assert_equal(result, expected) + + +def test_center_ljust_rjust(dtype): + values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype) + + result = values.str.center(5) + expected = xr.DataArray([' a ', ' b ', ' c ', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.ljust(5) + expected = xr.DataArray(['a ', 'b ', 'c ', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + result = values.str.rjust(5) + expected = xr.DataArray([' a', ' b', ' c', 'eeeee']).astype(dtype) + assert_equal(result, expected) + + +def test_center_ljust_rjust_fillchar(dtype): + values = xr.DataArray(['a', 'bb', 'cccc', 'ddddd', 'eeeeee']).astype(dtype) + result = values.str.center(5, fillchar='X') + expected = xr.DataArray(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee']) + assert_equal(result, expected.astype(dtype)) + + result = values.str.ljust(5, fillchar='X') + expected = xr.DataArray(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee']) + assert_equal(result, expected.astype(dtype)) + + result = values.str.rjust(5, fillchar='X') + expected = xr.DataArray(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee']) + assert_equal(result, expected.astype(dtype)) + + # If fillchar is not a charatter, normal str raises TypeError + # 'aaa'.ljust(5, 'XY') + # TypeError: must be char, not str + template = "fillchar must be a character, not {dtype}" + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.center(5, fillchar='XY') + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.ljust(5, fillchar='XY') + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.rjust(5, fillchar='XY') + + +def test_zfill(dtype): + values = xr.DataArray(['1', '22', 'aaa', '333', '45678']).astype(dtype) + + result = values.str.zfill(5) + expected = xr.DataArray(['00001', '00022', '00aaa', '00333', '45678']) + assert_equal(result, expected.astype(dtype)) + + result = values.str.zfill(3) + expected = xr.DataArray(['001', '022', 'aaa', '333', '45678']) + assert_equal(result, expected.astype(dtype)) + + +def test_slice(dtype): + arr = xr.DataArray(['aafootwo', 'aabartwo', 'aabazqux']).astype(dtype) + + result = arr.str.slice(2, 5) + exp = xr.DataArray(['foo', 'bar', 'baz']).astype(dtype) + assert_equal(result, exp) + + for start, stop, step in [(0, 3, -1), (None, None, -1), + (3, 10, 2), (3, 0, -1)]: + try: + result = arr.str[start:stop:step] + expected = xr.DataArray([s[start:stop:step] for s in arr.values]) + assert_equal(result, expected.astype(dtype)) + except IndexError: + print('failed on %s:%s:%s' % (start, stop, step)) + raise + + +def test_slice_replace(dtype): + da = lambda x: xr.DataArray(x).astype(dtype) + values = da(['short', 'a bit longer', 'evenlongerthanthat', '']) + + expected = da(['shrt', 'a it longer', 'evnlongerthanthat', '']) + result = values.str.slice_replace(2, 3) + assert_equal(result, expected) + + expected = da(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z']) + result = values.str.slice_replace(2, 3, 'z') + assert_equal(result, expected) + + expected = da(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z']) + result = values.str.slice_replace(2, 2, 'z') + assert_equal(result, expected) + + expected = da(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z']) + result = values.str.slice_replace(2, 1, 'z') + assert_equal(result, expected) + + expected = da(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z']) + result = values.str.slice_replace(-1, None, 'z') + 
assert_equal(result, expected) + + expected = da(['zrt', 'zer', 'zat', 'z']) + result = values.str.slice_replace(None, -2, 'z') + assert_equal(result, expected) + + expected = da(['shortz', 'a bit znger', 'evenlozerthanthat', 'z']) + result = values.str.slice_replace(6, 8, 'z') + assert_equal(result, expected) + + expected = da(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z']) + result = values.str.slice_replace(-10, 3, 'z') + assert_equal(result, expected) + + +def test_strip_lstrip_rstrip(dtype): + values = xr.DataArray([' aa ', ' bb \n', 'cc ']).astype(dtype) + + result = values.str.strip() + expected = xr.DataArray(['aa', 'bb', 'cc']).astype(dtype) + assert_equal(result, expected) + + result = values.str.lstrip() + expected = xr.DataArray(['aa ', 'bb \n', 'cc ']).astype(dtype) + assert_equal(result, expected) + + result = values.str.rstrip() + expected = xr.DataArray([' aa', ' bb', 'cc']).astype(dtype) + assert_equal(result, expected) + + +def test_strip_lstrip_rstrip_args(dtype): + values = xr.DataArray(['xxABCxx', 'xx BNSD', 'LDFJH xx']).astype(dtype) + + rs = values.str.strip('x') + xp = xr.DataArray(['ABC', ' BNSD', 'LDFJH ']).astype(dtype) + assert_equal(rs, xp) + + rs = values.str.lstrip('x') + xp = xr.DataArray(['ABCxx', ' BNSD', 'LDFJH xx']).astype(dtype) + assert_equal(rs, xp) + + rs = values.str.rstrip('x') + xp = xr.DataArray(['xxABC', 'xx BNSD', 'LDFJH ']).astype(dtype) + assert_equal(rs, xp) + + +def test_wrap(): + # test values are: two words less than width, two words equal to width, + # two words greater than width, one word less than width, one word + # equal to width, one word greater than width, multiple tokens with + # trailing whitespace equal to width + values = xr.DataArray(['hello world', 'hello world!', 'hello world!!', + 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdefa', + 'ab ab ab ab ', 'ab ab ab ab a', '\t']) + + # expected values + xp = xr.DataArray(['hello world', 'hello world!', 'hello\nworld!!', + 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdef\na', + 'ab ab ab ab', 'ab ab ab ab\na', '']) + + rs = values.str.wrap(12, break_long_words=True) + assert_equal(rs, xp) + + # test with pre and post whitespace (non-unicode), NaN, and non-ascii + # Unicode + values = xr.DataArray([' pre ', '\xac\u20ac\U00008000 abadcafe']) + xp = xr.DataArray([' pre', '\xac\u20ac\U00008000 ab\nadcafe']) + rs = values.str.wrap(6) + assert_equal(rs, xp) + + +def test_get(dtype): + values = xr.DataArray(['a_b_c', 'c_d_e', 'f_g_h']).astype(dtype) + + result = values.str[2] + expected = xr.DataArray(['b', 'd', 'g']).astype(dtype) + assert_equal(result, expected) + + # bounds testing + values = xr.DataArray(['1_2_3_4_5', '6_7_8_9_10', '11_12']).astype(dtype) + + # positive index + result = values.str[5] + expected = xr.DataArray(['_', '_', '']).astype(dtype) + assert_equal(result, expected) + + # negative index + result = values.str[-6] + expected = xr.DataArray(['_', '8', '']).astype(dtype) + assert_equal(result, expected) + + +def test_encode_decode(): + data = xr.DataArray(['a', 'b', 'a\xe4']) + encoded = data.str.encode('utf-8') + decoded = encoded.str.decode('utf-8') + assert_equal(data, decoded) + + +def test_encode_decode_errors(): + encodeBase = xr.DataArray(['a', 'b', 'a\x9d']) + + msg = (r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to <undefined>") + with pytest.raises(UnicodeEncodeError, match=msg): + encodeBase.str.encode('cp1252') + + f = lambda x: x.encode('cp1252', 'ignore') + result = encodeBase.str.encode('cp1252', 'ignore') + expected =
xr.DataArray([f(x) for x in encodeBase.values.tolist()]) + assert_equal(result, expected) + + decodeBase = xr.DataArray([b'a', b'b', b'a\x9d']) + + msg = ("'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to <undefined>") + with pytest.raises(UnicodeDecodeError, match=msg): + decodeBase.str.decode('cp1252') + + f = lambda x: x.decode('cp1252', 'ignore') + result = decodeBase.str.decode('cp1252', 'ignore') + expected = xr.DataArray([f(x) for x in decodeBase.values.tolist()]) + assert_equal(result, expected) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 08347b5ed1b..89190eee590 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2,15 +2,15 @@ import itertools import math import os.path -from pathlib import Path import pickle import shutil import sys import tempfile -from typing import Optional import warnings from contextlib import ExitStack from io import BytesIO +from pathlib import Path +from typing import Optional import numpy as np import pandas as pd @@ -18,26 +18,26 @@ import xarray as xr from xarray import ( - DataArray, Dataset, backends, open_dataarray, open_dataset, open_mfdataset, - save_mfdataset) + DataArray, Dataset, backends, load_dataarray, load_dataset, open_dataarray, + open_dataset, open_mfdataset, save_mfdataset) from xarray.backends.common import robust_getitem from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding from xarray.backends.pydap_ import PydapDataStore +from xarray.coding.variables import SerializationWarning from xarray.core import indexing from xarray.core.options import set_options from xarray.core.pycompat import dask_array_type from xarray.tests import mock -from xarray.coding.variables import SerializationWarning from .
import ( assert_allclose, assert_array_equal, assert_equal, assert_identical, has_dask, has_netCDF4, has_scipy, network, raises_regex, requires_cfgrib, - requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4, - requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio, - requires_rasterio, requires_scipy, requires_scipy_or_netCDF4, - requires_zarr, requires_h5fileobj) -from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS, - _ALL_CALENDARS) + requires_cftime, requires_dask, requires_h5fileobj, requires_h5netcdf, + requires_netCDF4, requires_pathlib, requires_pseudonetcdf, requires_pydap, + requires_pynio, requires_rasterio, requires_scipy, + requires_scipy_or_netCDF4, requires_zarr) +from .test_coding_times import ( + _ALL_CALENDARS, _NON_STANDARD_CALENDARS, _STANDARD_CALENDARS) from .test_dataset import create_test_data try: @@ -1134,6 +1134,18 @@ def test_encoding_kwarg_compression(self): assert ds.x.encoding == {} + def test_keep_chunksizes_if_no_original_shape(self): + ds = Dataset({'x': [1, 2, 3]}) + chunksizes = (2, ) + ds.variables['x'].encoding = { + 'chunksizes': chunksizes + } + + with self.roundtrip(ds) as actual: + assert_identical(ds, actual) + assert_array_equal(ds['x'].encoding['chunksizes'], + actual['x'].encoding['chunksizes']) + def test_encoding_chunksizes_unlimited(self): # regression test for GH1225 ds = Dataset({'x': [1, 2, 3], 'y': ('x', [2, 3, 4])}) @@ -2662,6 +2674,23 @@ def test_save_mfdataset_compute_false_roundtrip(self): with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(actual, original) + def test_load_dataset(self): + with create_tmp_file() as tmp: + original = Dataset({'foo': ('x', np.random.randn(10))}) + original.to_netcdf(tmp) + ds = load_dataset(tmp) + # this would fail if we used open_dataset instead of load_dataset + ds.to_netcdf(tmp) + + def test_load_dataarray(self): + with create_tmp_file() as tmp: + original = Dataset({'foo': ('x', np.random.randn(10))}) + original.to_netcdf(tmp) + ds = load_dataarray(tmp) + # this would fail if we used open_dataarray instead of + # load_dataarray + ds.to_netcdf(tmp) + @requires_scipy_or_netCDF4 @requires_pydap @@ -3529,6 +3558,11 @@ def test_extract_nc4_variable_encoding(self): encoding = _extract_nc4_variable_encoding(var, raise_on_invalid=True) assert {'shuffle': True} == encoding + # Variables with unlim dims must be chunked on output. 
+ var = xr.Variable(('x',), [1, 2, 3], {}, {'contiguous': True}) + encoding = _extract_nc4_variable_encoding(var, unlimited_dims=('x',)) + assert {} == encoding + def test_extract_h5nc_encoding(self): # not supported with h5netcdf (yet) var = xr.Variable(('x',), [1, 2, 3], {}, diff --git a/xarray/tests/test_backends_lru_cache.py b/xarray/tests/test_backends_lru_cache.py index d64d718f2f7..aa97f5fb4cb 100644 --- a/xarray/tests/test_backends_lru_cache.py +++ b/xarray/tests/test_backends_lru_cache.py @@ -1,4 +1,5 @@ from unittest import mock + import pytest from xarray.backends.lru_cache import LRUCache diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 1cf257c96eb..b3560fe3039 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -6,8 +6,8 @@ from xarray import CFTimeIndex from xarray.coding.cftime_offsets import ( - _MONTH_ABBREVIATIONS, BaseCFTimeOffset, Day, Hour, Minute, Second, - MonthBegin, MonthEnd, YearBegin, YearEnd, QuarterBegin, QuarterEnd, + _MONTH_ABBREVIATIONS, BaseCFTimeOffset, Day, Hour, Minute, MonthBegin, + MonthEnd, QuarterBegin, QuarterEnd, Second, YearBegin, YearEnd, _days_in_month, cftime_range, get_date_type, to_cftime_datetime, to_offset) cftime = pytest.importorskip('cftime') diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index c5cdf0a3fee..56c01fbdc28 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -8,7 +8,7 @@ from xarray.coding.cftimeindex import ( CFTimeIndex, _parse_array_of_cftime_strings, _parse_iso8601_with_reso, _parsed_string_to_bounds, assert_all_valid_date_type, parse_iso8601) -from xarray.tests import assert_array_equal, assert_allclose, assert_identical +from xarray.tests import assert_array_equal, assert_identical from . import ( has_cftime, has_cftime_1_0_2_1, has_cftime_or_netCDF4, raises_regex, @@ -785,6 +785,17 @@ def test_parse_array_of_cftime_strings(): np.testing.assert_array_equal(result, expected) +@pytest.mark.skipif(not has_cftime, reason='cftime not installed') +@pytest.mark.parametrize('calendar', _ALL_CALENDARS) +def test_strftime_of_cftime_array(calendar): + date_format = '%Y%m%d%H%M' + cf_values = xr.cftime_range('2000', periods=5, calendar=calendar) + dt_values = pd.date_range('2000', periods=5) + expected = dt_values.strftime(date_format) + result = cf_values.strftime(date_format) + assert result.equals(expected) + + @pytest.mark.skipif(not has_cftime, reason='cftime not installed') @pytest.mark.parametrize('calendar', _ALL_CALENDARS) @pytest.mark.parametrize('unsafe', [False, True]) diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py index 7aca4492680..108b303e0c0 100644 --- a/xarray/tests/test_cftimeindex_resample.py +++ b/xarray/tests/test_cftimeindex_resample.py @@ -1,12 +1,12 @@ -import pytest - import datetime + import numpy as np import pandas as pd +import pytest + import xarray as xr from xarray.core.resample_cftime import CFTimeGrouper - pytest.importorskip('cftime') pytest.importorskip('pandas', minversion='0.24') diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 95c8ebc0b42..9f937ac7f5e 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -6,7 +6,7 @@ import xarray as xr from xarray.coding import variables -from . import assert_identical, requires_dask +from . 
import assert_equal, assert_identical, requires_dask with suppress(ImportError): import dask.array as da @@ -20,6 +20,23 @@ def test_CFMaskCoder_decode(): assert_identical(expected, encoded) +def test_CFMaskCoder_missing_value(): + expected = xr.DataArray(np.array([[26915, 27755, -9999, 27705], + [25595, -9999, 28315, -9999]]), + dims=['npts', 'ntimes'], + name='tmpk') + expected.attrs['missing_value'] = -9999 + + decoded = xr.decode_cf(expected.to_dataset()) + encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + + assert_equal(encoded['tmpk'], expected.variable) + + decoded.tmpk.encoding['_FillValue'] = -9940 + with pytest.raises(ValueError): + encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + + @requires_dask def test_CFMaskCoder_decode_dask(): original = xr.Variable(('x',), [0, -1, 1], {'_FillValue': -1}).chunk() diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 1d8ed169d29..e9b63dd18fc 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -7,14 +7,15 @@ import pytest from xarray import DataArray, Dataset, Variable, auto_combine, concat +from xarray.core import dtypes from xarray.core.combine import ( _auto_combine, _auto_combine_1d, _auto_combine_all_along_first_dim, _check_shape_tile_ids, _combine_nd, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, _new_tile_id) from . import ( - InaccessibleArray, assert_array_equal, - assert_equal, assert_identical, raises_regex, requires_dask) + InaccessibleArray, assert_array_equal, assert_equal, assert_identical, + raises_regex, requires_dask) from .test_dataset import create_test_data @@ -237,6 +238,20 @@ def test_concat_multiindex(self): assert expected.equals(actual) assert isinstance(actual.x.to_index(), pd.MultiIndex) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_concat_fill_value(self, fill_value): + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'a': (('t', 'x'), + [[fill_value, 2, 3], [1, 2, fill_value]])}, + {'x': [0, 1, 2]}) + actual = concat(datasets, dim='t', fill_value=fill_value) + assert_identical(actual, expected) + class TestConcatDataArray: def test_concat(self): @@ -306,6 +321,19 @@ def test_concat_lazy(self): assert combined.shape == (2, 3, 3) assert combined.dims == ('z', 'x', 'y') + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_concat_fill_value(self, fill_value): + foo = DataArray([1, 2], coords=[('x', [1, 2])]) + bar = DataArray([1, 2], coords=[('x', [1, 3])]) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = DataArray([[1, 2, fill_value], [1, fill_value, 2]], + dims=['y', 'x'], coords={'x': [1, 2, 3]}) + actual = concat((foo, bar), dim='y', fill_value=fill_value) + assert_identical(actual, expected) + class TestAutoCombine: @@ -417,6 +445,20 @@ def test_auto_combine_no_concat(self): {'baz': [100]}) assert_identical(expected, actual) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_auto_combine_fill_value(self, fill_value): + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = 
np.nan + expected = Dataset({'a': (('t', 'x'), + [[fill_value, 2, 3], [1, 2, fill_value]])}, + {'x': [0, 1, 2]}) + actual = auto_combine(datasets, concat_dim='t', fill_value=fill_value) + assert_identical(expected, actual) + def assert_combined_tile_ids_equal(dict1, dict2): assert len(dict1) == len(dict2) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 9471ec144c0..b7235629d7a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1,9 +1,9 @@ import pickle +import sys import warnings from collections import OrderedDict from copy import deepcopy from textwrap import dedent -import sys import numpy as np import pandas as pd @@ -12,7 +12,7 @@ import xarray as xr from xarray import ( DataArray, Dataset, IndexVariable, Variable, align, broadcast) -from xarray.coding.times import CFDatetimeCoder, _import_cftime +from xarray.coding.times import CFDatetimeCoder from xarray.convert import from_cdms2 from xarray.core import dtypes from xarray.core.common import ALL_DIMS, full_like @@ -20,7 +20,7 @@ LooseVersion, ReturnItem, assert_allclose, assert_array_equal, assert_equal, assert_identical, raises_regex, requires_bottleneck, requires_cftime, requires_dask, requires_iris, requires_np113, - requires_scipy, source_ndarray) + requires_numbagg, requires_scipy, source_ndarray) class TestDataArray: @@ -1177,7 +1177,7 @@ def test_reset_coords(self): dims=['x', 'y'], name='foo') assert_identical(actual, expected) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): with raises_regex(ValueError, 'cannot reset coord'): data = data.reset_coords(inplace=True) with raises_regex(ValueError, 'cannot be found'): @@ -1259,18 +1259,6 @@ def test_reindex_like_no_index(self): ValueError, 'different size for unlabeled'): foo.reindex_like(bar) - @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) - def test_reindex_fill_value(self, fill_value): - foo = DataArray([10, 20], dims='y', coords={'y': [0, 1]}) - bar = DataArray([10, 20, 30], dims='y', coords={'y': [0, 1, 2]}) - if fill_value == dtypes.NA: - # if we supply the default, we expect the missing value for a - # float array - fill_value = np.nan - actual = x.reindex_like(bar, fill_value=fill_value) - expected = DataArray([10, 20, fill_value], coords=[('y', [0, 1, 2])]) - assert_identical(expected, actual) - @pytest.mark.filterwarnings('ignore:Indexer has dimensions') def test_reindex_regressions(self): # regression test for #279 @@ -1540,7 +1528,7 @@ def test_reorder_levels(self): obj = self.mda.reorder_levels(x=['level_2', 'level_1']) assert_identical(obj, expected) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): array = self.mda.copy() array.reorder_levels(x=['level_2', 'level_1'], inplace=True) assert_identical(array, expected) @@ -1644,8 +1632,8 @@ def test_math_name(self): assert (a + a.rename(None)).name is None assert (a + a.rename('bar')).name is None assert (a + a).name == 'foo' - assert (+a['x']).name is 'x' - assert (a['x'] + 0).name is 'x' + assert (+a['x']).name == 'x' + assert (a['x'] + 0).name == 'x' assert (a + a['x']).name is None def test_math_with_coords(self): @@ -1681,14 +1669,14 @@ def test_math_with_coords(self): assert_identical(expected, actual) actual = orig[0, :] + orig[:, 0] - assert_identical(expected.T, actual) + assert_identical(expected.transpose(transpose_coords=True), actual) - actual = orig - 
orig.T + actual = orig - orig.transpose(transpose_coords=True) expected = DataArray(np.zeros((2, 3)), orig.coords) assert_identical(expected, actual) - actual = orig.T - orig - assert_identical(expected.T, actual) + actual = orig.transpose(transpose_coords=True) - orig + assert_identical(expected.transpose(transpose_coords=True), actual) alt = DataArray([1, 1], {'x': [-1, -2], 'c': 'foo', 'd': 555}, 'x') actual = orig + alt @@ -1760,6 +1748,16 @@ def test_stack_unstack(self): orig = DataArray([[0, 1], [2, 3]], dims=['x', 'y'], attrs={'foo': 2}) assert_identical(orig, orig.unstack()) + # test GH3000 + a = orig[:0, :1].stack(dim=('x', 'y')).dim.to_index() + if pd.__version__ < '0.24.0': + b = pd.MultiIndex(levels=[pd.Int64Index([]), pd.Int64Index([0])], + labels=[[], []], names=['x', 'y']) + else: + b = pd.MultiIndex(levels=[pd.Int64Index([]), pd.Int64Index([0])], + codes=[[], []], names=['x', 'y']) + pd.util.testing.assert_index_equal(a, b) + actual = orig.stack(z=['x', 'y']).unstack('z').drop(['x', 'y']) assert_identical(orig, actual) @@ -1801,8 +1799,27 @@ def test_stack_nonunique_consistency(self): assert_identical(expected, actual) def test_transpose(self): - assert_equal(self.dv.variable.transpose(), - self.dv.transpose().variable) + da = DataArray(np.random.randn(3, 4, 5), dims=('x', 'y', 'z'), + coords={'x': range(3), 'y': range(4), 'z': range(5), + 'xy': (('x', 'y'), np.random.randn(3, 4))}) + + actual = da.transpose(transpose_coords=False) + expected = DataArray(da.values.T, dims=('z', 'y', 'x'), + coords=da.coords) + assert_equal(expected, actual) + + actual = da.transpose('z', 'y', 'x', transpose_coords=True) + expected = DataArray(da.values.T, dims=('z', 'y', 'x'), + coords={'x': da.x.values, 'y': da.y.values, + 'z': da.z.values, + 'xy': (('y', 'x'), da.xy.values.T)}) + assert_equal(expected, actual) + + with pytest.raises(ValueError): + da.transpose('x', 'y') + + with pytest.warns(FutureWarning): + da.transpose() def test_squeeze(self): assert_equal(self.dv.variable.squeeze(), self.dv.squeeze().variable) @@ -1842,19 +1859,34 @@ def test_drop_coordinates(self): with pytest.raises(ValueError): arr.drop('not found') + actual = expected.drop('not found', errors='ignore') + assert_identical(actual, expected) + with raises_regex(ValueError, 'cannot be found'): arr.drop(None) + actual = expected.drop(None, errors='ignore') + assert_identical(actual, expected) + renamed = arr.rename('foo') with raises_regex(ValueError, 'cannot be found'): renamed.drop('foo') + actual = renamed.drop('foo', errors='ignore') + assert_identical(actual, renamed) + def test_drop_index_labels(self): arr = DataArray(np.random.randn(2, 3), coords={'y': [0, 1, 2]}, dims=['x', 'y']) actual = arr.drop([0, 1], dim='y') expected = arr[:, 2:] - assert_identical(expected, actual) + assert_identical(actual, expected) + + with raises_regex((KeyError, ValueError), 'not .* in axis'): + actual = arr.drop([0, 1, 3], dim='y') + + actual = arr.drop([0, 1, 3], dim='y', errors='ignore') + assert_identical(actual, expected) def test_dropna(self): x = np.random.randn(4, 4) @@ -1959,6 +1991,44 @@ def test_reduce(self): dims=['x', 'y']).mean('x') assert_equal(actual, expected) + def test_reduce_keepdims(self): + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Mean on all axes loses non-constant coordinates + actual = orig.mean(keepdims=True) + expected = 
DataArray(orig.data.mean(keepdims=True), dims=orig.dims, + coords={k: v for k, v in coords.items() + if k in ['c']}) + assert_equal(actual, expected) + + assert actual.sizes['x'] == 1 + assert actual.sizes['y'] == 1 + + # Mean on specific axes loses coordinates not involving that axis + actual = orig.mean('y', keepdims=True) + expected = DataArray(orig.data.mean(axis=1, keepdims=True), + dims=orig.dims, + coords={k: v for k, v in coords.items() + if k not in ['y', 'lat']}) + assert_equal(actual, expected) + + @requires_bottleneck + def test_reduce_keepdims_bottleneck(self): + import bottleneck + + coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], + 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), + 'c': -999} + orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y']) + + # Bottleneck does not have its own keepdims implementation + actual = orig.reduce(bottleneck.nanmean, keepdims=True) + expected = orig.mean(keepdims=True) + assert_equal(actual, expected) + def test_reduce_dtype(self): coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), @@ -2258,6 +2328,23 @@ def test_groupby_restore_dim_order(self): result = array.groupby(by).apply(lambda x: x.squeeze()) assert result.dims == expected_dims + def test_groupby_restore_coord_dims(self): + array = DataArray(np.random.randn(5, 3), + coords={'a': ('x', range(5)), 'b': ('y', range(3)), + 'c': (('x', 'y'), np.random.randn(5, 3))}, + dims=['x', 'y']) + + for by, expected_dims in [('x', ('x', 'y')), + ('y', ('x', 'y')), + ('a', ('a', 'y')), + ('b', ('x', 'b'))]: + result = array.groupby(by, restore_coord_dims=True).apply( + lambda x: x.squeeze())['c'] + assert result.dims == expected_dims + + with pytest.warns(FutureWarning): + array.groupby('x').apply(lambda x: x.squeeze()) + def test_groupby_first_and_last(self): array = DataArray([1, 2, 3, 4, 5], dims='x') by = DataArray(['a'] * 2 + ['b'] * 3, dims='x', name='ab') @@ -2445,15 +2532,18 @@ def test_resample_drop_nondim_coords(self): array = ds['data'] # Re-sample - actual = array.resample(time="12H").mean('time') + actual = array.resample( + time="12H", restore_coord_dims=True).mean('time') assert 'tc' not in actual.coords # Up-sample - filling - actual = array.resample(time="1H").ffill() + actual = array.resample( + time="1H", restore_coord_dims=True).ffill() assert 'tc' not in actual.coords # Up-sample - interpolation - actual = array.resample(time="1H").interpolate('linear') + actual = array.resample( + time="1H", restore_coord_dims=True).interpolate('linear') assert 'tc' not in actual.coords def test_resample_keep_attrs(self): @@ -3417,6 +3507,19 @@ def test_dot(self): with pytest.raises(TypeError): da.dot(dm.values) + def test_matmul(self): + + # copied from above (could make a fixture) + x = np.linspace(-3, 3, 6) + y = np.linspace(-3, 3, 5) + z = range(4) + da_vals = np.arange(6 * 5 * 4).reshape((6, 5, 4)) + da = DataArray(da_vals, coords=[x, y, z], dims=['x', 'y', 'z']) + + result = da @ da + expected = da.dot(da) + assert_identical(result, expected) + def test_binary_op_join_setting(self): dim = 'x' align_type = "outer" @@ -3854,14 +3957,14 @@ def test_to_and_from_iris(self): assert coord.var_name == original_coord.name assert_array_equal( coord.points, CFDatetimeCoder().encode(original_coord).values) - assert (actual.coord_dims(coord) == - original.get_axis_num( + assert (actual.coord_dims(coord) + == original.get_axis_num( original.coords[coord.var_name].dims)) - assert (actual.coord('distance2').attributes['foo'] == - 
original.coords['distance2'].attrs['foo']) - assert (actual.coord('distance').units == - cf_units.Unit(original.coords['distance'].units)) + assert (actual.coord('distance2').attributes['foo'] + == original.coords['distance2'].attrs['foo']) + assert (actual.coord('distance').units + == cf_units.Unit(original.coords['distance'].units)) assert actual.attributes['baz'] == original.attrs['baz'] assert actual.standard_name == original.attrs['standard_name'] @@ -3919,14 +4022,14 @@ def test_to_and_from_iris_dask(self): assert coord.var_name == original_coord.name assert_array_equal( coord.points, CFDatetimeCoder().encode(original_coord).values) - assert (actual.coord_dims(coord) == - original.get_axis_num( + assert (actual.coord_dims(coord) + == original.get_axis_num( original.coords[coord.var_name].dims)) assert (actual.coord('distance2').attributes['foo'] == original.coords[ 'distance2'].attrs['foo']) - assert (actual.coord('distance').units == - cf_units.Unit(original.coords['distance'].units)) + assert (actual.coord('distance').units + == cf_units.Unit(original.coords['distance'].units)) assert actual.attributes['baz'] == original.attrs['baz'] assert actual.standard_name == original.attrs['standard_name'] @@ -4022,3 +4125,30 @@ def test_fallback_to_iris_AuxCoord(self, coord_values): expected = Cube(data, aux_coords_and_dims=[ (AuxCoord(coord_values, var_name='space'), 0)]) assert result == expected + + +@requires_numbagg +@pytest.mark.parametrize('dim', ['time', 'x']) +@pytest.mark.parametrize('window_type, window', [ + ['span', 5], + ['alpha', 0.5], + ['com', 0.5], + ['halflife', 5], +]) +def test_rolling_exp(da, dim, window_type, window): + da = da.isel(a=0) + da = da.where(da > 0.2) + + result = da.rolling_exp(window_type=window_type, **{dim: window}).mean() + assert isinstance(result, DataArray) + + pandas_array = da.to_pandas() + assert pandas_array.index.name == 'time' + if dim == 'x': + pandas_array = pandas_array.T + expected = ( + xr.DataArray(pandas_array.ewm(**{window_type: window}).mean()) + .transpose(*da.dims) + ) + + assert_allclose(expected.variable, result.variable) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b47e26328ad..1265f6a337a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -22,8 +22,8 @@ from . 
import ( InaccessibleArray, UnexpectedDataAccess, assert_allclose, assert_array_equal, assert_equal, assert_identical, has_cftime, has_dask, - raises_regex, requires_bottleneck, requires_dask, requires_scipy, - source_ndarray, requires_cftime) + raises_regex, requires_bottleneck, requires_cftime, requires_dask, + requires_numbagg, requires_scipy, source_ndarray) try: import dask.array as da @@ -1889,6 +1889,15 @@ def test_drop_variables(self): with raises_regex(ValueError, 'cannot be found'): data.drop('not_found_here') + actual = data.drop('not_found_here', errors='ignore') + assert_identical(data, actual) + + actual = data.drop(['not_found_here'], errors='ignore') + assert_identical(data, actual) + + actual = data.drop(['time', 'not_found_here'], errors='ignore') + assert_identical(expected, actual) + def test_drop_index_labels(self): data = Dataset({'A': (['x', 'y'], np.random.randn(2, 3)), 'x': ['a', 'b']}) @@ -1907,6 +1916,16 @@ def test_drop_index_labels(self): # not contained in axis data.drop(['c'], dim='x') + actual = data.drop(['c'], dim='x', errors='ignore') + assert_identical(data, actual) + + with pytest.raises(ValueError): + data.drop(['c'], dim='x', errors='wrong_value') + + actual = data.drop(['a', 'b', 'c'], 'x', errors='ignore') + expected = data.isel(x=slice(0, 0)) + assert_identical(expected, actual) + with raises_regex( ValueError, 'does not have coordinate labels'): data.drop(1, 'y') @@ -1931,6 +1950,22 @@ def test_drop_dims(self): with pytest.raises((ValueError, KeyError)): data.drop_dims('z') # not a dimension + with pytest.raises((ValueError, KeyError)): + data.drop_dims(None) + + actual = data.drop_dims('z', errors='ignore') + assert_identical(data, actual) + + actual = data.drop_dims(None, errors='ignore') + assert_identical(data, actual) + + with pytest.raises(ValueError): + actual = data.drop_dims('z', errors='wrong_value') + + actual = data.drop_dims(['x', 'y', 'z'], errors='ignore') + expected = data.drop(['A', 'B', 'x']) + assert_identical(expected, actual) + def test_copy(self): data = create_test_data() data.attrs['Test'] = [1, 2, 3] @@ -2258,7 +2293,7 @@ def test_set_index(self): obj = ds.set_index(x=mindex.names) assert_identical(obj, expected) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): ds.set_index(x=mindex.names, inplace=True) assert_identical(ds, expected) @@ -2278,7 +2313,7 @@ def test_reset_index(self): obj = ds.reset_index('x') assert_identical(obj, expected) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): ds.reset_index('x', inplace=True) assert_identical(ds, expected) @@ -2291,7 +2326,7 @@ def test_reorder_levels(self): reindexed = ds.reorder_levels(x=['level_2', 'level_1']) assert_identical(reindexed, expected) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): ds.reorder_levels(x=['level_2', 'level_1'], inplace=True) assert_identical(ds, expected) @@ -2375,7 +2410,7 @@ def test_update(self): assert actual_result is actual assert_identical(expected, actual) - with pytest.warns(FutureWarning, message='The inplace argument'): + with pytest.warns(FutureWarning, match='The inplace argument'): actual = data.update(data, inplace=False) expected = data assert actual is not expected @@ -2752,6 +2787,11 @@ def test_delitem(self): assert set(data.variables) == all_items - set(['var1', 'numbers']) 
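        # deleting the 'numbers' coordinate removes it from .coords as well as .variables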
assert 'numbers' not in data.coords + expected = Dataset() + actual = Dataset({'y': ('x', [1, 2])}) + del actual['y'] + assert_identical(expected, actual) + def test_squeeze(self): data = Dataset({'foo': (['x', 'y', 'z'], [[[1], [2]]])}) for args in [[], [['x']], [['x', 'z']]]: @@ -3858,6 +3898,25 @@ def total_sum(x): with raises_regex(TypeError, "unexpected keyword argument 'axis'"): ds.reduce(total_sum, dim='x') + def test_reduce_keepdims(self): + ds = Dataset({'a': (['x', 'y'], [[0, 1, 2, 3, 4]])}, + coords={'y': [0, 1, 2, 3, 4], 'x': [0], + 'lat': (['x', 'y'], [[0, 1, 2, 3, 4]]), + 'c': -999.0}) + + # Shape should match behaviour of numpy reductions with keepdims=True + # Coordinates involved in the reduction should be removed + actual = ds.mean(keepdims=True) + expected = Dataset({'a': (['x', 'y'], np.mean(ds.a, keepdims=True))}, + coords={'c': ds.c}) + assert_identical(expected, actual) + + actual = ds.mean('x', keepdims=True) + expected = Dataset({'a': (['x', 'y'], + np.mean(ds.a, axis=0, keepdims=True))}, + coords={'y': ds.y, 'c': ds.c}) + assert_identical(expected, actual) + def test_quantile(self): ds = create_test_data(seed=123) @@ -4062,14 +4121,20 @@ def test_dataset_math_errors(self): def test_dataset_transpose(self): ds = Dataset({'a': (('x', 'y'), np.random.randn(3, 4)), - 'b': (('y', 'x'), np.random.randn(4, 3))}) + 'b': (('y', 'x'), np.random.randn(4, 3))}, + coords={'x': range(3), 'y': range(4), + 'xy': (('x', 'y'), np.random.randn(3, 4))}) actual = ds.transpose() - expected = ds.apply(lambda x: x.transpose()) + expected = Dataset({'a': (('y', 'x'), ds.a.values.T), + 'b': (('x', 'y'), ds.b.values.T)}, + coords={'x': ds.x.values, 'y': ds.y.values, + 'xy': (('y', 'x'), ds.xy.values.T)}) assert_identical(expected, actual) actual = ds.transpose('x', 'y') - expected = ds.apply(lambda x: x.transpose('x', 'y')) + expected = ds.apply( + lambda x: x.transpose('x', 'y', transpose_coords=True)) assert_identical(expected, actual) ds = create_test_data() @@ -4609,7 +4674,7 @@ def test_dataset_constructor_aligns_to_explicit_coords( def test_error_message_on_set_supplied(): - with pytest.raises(TypeError, message='has invalid type set'): + with pytest.raises(TypeError, match="has invalid type "): xr.Dataset(dict(date=[1, 2, 3], sec={4})) @@ -4619,7 +4684,7 @@ def test_error_message_on_set_supplied(): def test_constructor_raises_with_invalid_coords(unaligned_coords): with pytest.raises(ValueError, - message='not a subset of the DataArray dimensions'): + match='not a subset of the DataArray dimensions'): xr.DataArray([1, 2, 3], dims=['x'], coords=unaligned_coords) @@ -4750,9 +4815,9 @@ def test_rolling_wrapped_bottleneck(ds, name, center, min_periods, key): func_name = 'move_{0}'.format(name) actual = getattr(rolling_obj, name)() - if key is 'z1': # z1 does not depend on 'Time' axis. Stored as it is. + if key == 'z1': # z1 does not depend on 'Time' axis. Stored as it is. 
expected = ds[key] - elif key is 'z2': + elif key == 'z2': expected = getattr(bn, func_name)(ds[key].values, window=7, axis=0, min_count=min_periods) assert_array_equal(actual[key].values, expected) @@ -4763,6 +4828,13 @@ def test_rolling_wrapped_bottleneck(ds, name, center, min_periods, key): assert_equal(actual, ds['time']) +@requires_numbagg +def test_rolling_exp(ds): + + result = ds.rolling_exp(time=10, window_type='span').mean() + assert isinstance(result, Dataset) + + @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index 17f655cef8d..98c53ef2b12 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -1,4 +1,5 @@ """ isort:skip_file """ +# flake8: noqa: E402 - ignore linters re order of imports import pickle import pytest @@ -28,6 +29,7 @@ da = pytest.importorskip('dask.array') +loop = loop # loop is an imported fixture, which flake8 has issues ack-ing @pytest.fixture diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 75ab5f52a1b..87a7a2863d3 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -7,13 +7,13 @@ import pytest from numpy import array, nan -from xarray import DataArray, Dataset, concat, cftime_range +from xarray import DataArray, Dataset, cftime_range, concat from xarray.core import dtypes, duck_array_ops from xarray.core.duck_array_ops import ( array_notnull_equiv, concatenate, count, first, gradient, last, mean, rolling_window, stack, where) from xarray.core.pycompat import dask_array_type -from xarray.testing import assert_allclose, assert_equal, assert_identical +from xarray.testing import assert_allclose, assert_equal from . 
import ( assert_array_equal, has_dask, has_np113, raises_regex, requires_cftime, diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index b623c9bf05d..5433bd00f9d 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -105,4 +105,64 @@ def func(arg1, arg2, arg3=0): assert_identical(expected, actual) +def test_da_groupby_quantile(): + + array = xr.DataArray([1, 2, 3, 4, 5, 6], + [('x', [1, 1, 1, 2, 2, 2])]) + + # Scalar quantile + expected = xr.DataArray([2, 5], [('x', [1, 2])]) + actual = array.groupby('x').quantile(.5) + assert_identical(expected, actual) + + # Vector quantile + expected = xr.DataArray([[1, 3], [4, 6]], + [('x', [1, 2]), ('quantile', [0, 1])]) + actual = array.groupby('x').quantile([0, 1]) + assert_identical(expected, actual) + + # Multiple dimensions + array = xr.DataArray([[1, 11, 26], [2, 12, 22], [3, 13, 23], + [4, 16, 24], [5, 15, 25]], + [('x', [1, 1, 1, 2, 2],), + ('y', [0, 0, 1])]) + + actual_x = array.groupby('x').quantile(0) + expected_x = xr.DataArray([1, 4], + [('x', [1, 2]), ]) + assert_identical(expected_x, actual_x) + + actual_y = array.groupby('y').quantile(0) + expected_y = xr.DataArray([1, 22], + [('y', [0, 1]), ]) + assert_identical(expected_y, actual_y) + + actual_xx = array.groupby('x').quantile(0, dim='x') + expected_xx = xr.DataArray([[1, 11, 22], [4, 15, 24]], + [('x', [1, 2]), ('y', [0, 0, 1])]) + assert_identical(expected_xx, actual_xx) + + actual_yy = array.groupby('y').quantile(0, dim='y') + expected_yy = xr.DataArray([[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]], + [('x', [1, 1, 1, 2, 2]), ('y', [0, 1])]) + assert_identical(expected_yy, actual_yy) + + times = pd.date_range('2000-01-01', periods=365) + x = [0, 1] + foo = xr.DataArray(np.reshape(np.arange(365 * 2), (365, 2)), + coords=dict(time=times, x=x), dims=('time', 'x')) + g = foo.groupby(foo.time.dt.month) + + actual = g.quantile(0) + expected = xr.DataArray([0., 62., 120., 182., 242., 304., + 364., 426., 488., 548., 610., 670.], + [('month', np.arange(1, 13))]) + assert_identical(expected, actual) + + actual = g.quantile(0, dim='time')[:2] + expected = xr.DataArray([[0., 1], [62., 63]], + [('month', [1, 2]), ('x', [0, 1])]) + assert_identical(expected, actual) + + # TODO: move other groupby tests from test_dataset and test_dataarray over here diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 9301abb5e32..59435fea88b 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -505,13 +505,20 @@ def test_decompose_indexers(shape, indexer_mode, indexing_support): def test_implicit_indexing_adapter(): - array = np.arange(10) + array = np.arange(10, dtype=np.int64) implicit = indexing.ImplicitToExplicitIndexingAdapter( indexing.NumpyIndexingAdapter(array), indexing.BasicIndexer) np.testing.assert_array_equal(array, np.asarray(implicit)) np.testing.assert_array_equal(array, implicit[:]) +def test_implicit_indexing_adapter_copy_on_write(): + array = np.arange(10, dtype=np.int64) + implicit = indexing.ImplicitToExplicitIndexingAdapter( + indexing.CopyOnWriteArray(array)) + assert isinstance(implicit[:], indexing.ImplicitToExplicitIndexingAdapter) + + def test_outer_indexer_consistency_with_broadcast_indexes_vectorized(): def nonzero(x): if isinstance(x, np.ndarray) and x.dtype.kind == 'b': diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 8347d54bd1e..252f8bcacd4 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -6,8 +6,8 @@ from xarray.tests 
import ( assert_allclose, assert_equal, requires_cftime, requires_scipy) -from . import has_dask, has_scipy from ..coding.cftimeindex import _parse_array_of_cftime_strings +from . import has_dask, has_scipy from .test_dataset import create_test_data try: @@ -143,7 +143,8 @@ def func(obj, dim, new_x): 'y': da['y'], 'x': ('z', xdest.values), 'x2': ('z', func(da['x2'], 'x', xdest))}) - assert_allclose(actual, expected.transpose('z', 'y')) + assert_allclose(actual, + expected.transpose('z', 'y', transpose_coords=True)) # xdest is 2d xdest = xr.DataArray(np.linspace(0.1, 0.9, 30).reshape(6, 5), @@ -160,7 +161,8 @@ def func(obj, dim, new_x): coords={'z': xdest['z'], 'w': xdest['w'], 'z2': xdest['z2'], 'y': da['y'], 'x': (('z', 'w'), xdest), 'x2': (('z', 'w'), func(da['x2'], 'x', xdest))}) - assert_allclose(actual, expected.transpose('z', 'w', 'y')) + assert_allclose(actual, + expected.transpose('z', 'w', 'y', transpose_coords=True)) @pytest.mark.parametrize('case', [3, 4]) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 0d76db1d1ee..20e0fae8daf 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xarray.core import merge, dtypes +from xarray.core import dtypes, merge from . import raises_regex from .test_dataset import create_test_data @@ -67,6 +67,15 @@ def test_merge_alignment_error(self): with raises_regex(ValueError, 'indexes .* not equal'): xr.merge([ds, other], join='exact') + def test_merge_wrong_input_error(self): + with raises_regex(TypeError, "objects must be an iterable"): + xr.merge([1]) + ds = xr.Dataset(coords={'x': [1, 2]}) + with raises_regex(TypeError, "objects must be an iterable"): + xr.merge({'a': ds}) + with raises_regex(TypeError, "objects must be an iterable"): + xr.merge([ds, 1]) + def test_merge_no_conflicts_single_var(self): ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 759a2974ca6..0dc5fb320f0 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -8,16 +8,16 @@ import xarray as xr import xarray.plot as xplt from xarray import DataArray +from xarray.coding.times import _import_cftime from xarray.plot.plot import _infer_interval_breaks from xarray.plot.utils import ( _build_discrete_cmap, _color_palette, _determine_cmap_params, import_seaborn, label_from_attrs) from . import ( - assert_array_equal, assert_equal, raises_regex, requires_cftime, - requires_matplotlib, requires_matplotlib2, requires_seaborn, - requires_nc_time_axis) -from . 
import has_nc_time_axis + assert_array_equal, assert_equal, has_nc_time_axis, raises_regex, + requires_cftime, requires_matplotlib, requires_matplotlib2, + requires_nc_time_axis, requires_seaborn) # import mpl and change the backend before other mpl imports try: @@ -210,7 +210,8 @@ def test_2d_coords_line_plot(self): hdl = da.plot.line(x='lon', hue='y') assert len(hdl) == 4 - with pytest.raises(ValueError, message='If x or y are 2D '): + with pytest.raises( + ValueError, match="For 2D inputs, hue must be a dimension"): da.plot.line(x='lon', hue='lat') def test_2d_before_squeeze(self): @@ -537,25 +538,6 @@ def test_cmap_sequential_option(self): cmap_params = _determine_cmap_params(self.data) assert cmap_params['cmap'] == 'magma' - def test_do_nothing_if_provided_cmap(self): - cmap_list = [ - mpl.colors.LinearSegmentedColormap.from_list('name', ['r', 'g']), - mpl.colors.ListedColormap(['r', 'g', 'b']) - ] - - # can't parametrize with mpl objects when mpl is absent - for cmap in cmap_list: - cmap_params = _determine_cmap_params(self.data, - cmap=cmap, - levels=7) - assert cmap_params['cmap'] is cmap - - def test_do_something_if_provided_str_cmap(self): - cmap = 'RdBu_r' - cmap_params = _determine_cmap_params(self.data, cmap=cmap, levels=7) - assert cmap_params['cmap'] is not cmap - assert isinstance(cmap_params['cmap'], mpl.colors.ListedColormap) - def test_cmap_sequential_explicit_option(self): with xr.set_options(cmap_sequential=mpl.cm.magma): cmap_params = _determine_cmap_params(self.data) @@ -775,13 +757,14 @@ def test_discrete_colormap_list_of_levels(self): @pytest.mark.slow def test_discrete_colormap_int_levels(self): - for extend, levels, vmin, vmax in [('neither', 7, None, None), - ('neither', 7, None, 20), - ('both', 7, 4, 8), - ('min', 10, 4, 15)]: + for extend, levels, vmin, vmax, cmap in [ + ('neither', 7, None, None, None), + ('neither', 7, None, 20, mpl.cm.RdBu), + ('both', 7, 4, 8, None), + ('min', 10, 4, 15, None)]: for kind in ['imshow', 'pcolormesh', 'contourf', 'contour']: primitive = getattr(self.darray.plot, kind)( - levels=levels, vmin=vmin, vmax=vmax) + levels=levels, vmin=vmin, vmax=vmax, cmap=cmap) assert levels >= \ len(primitive.norm.boundaries) - 1 if vmax is None: @@ -1202,7 +1185,8 @@ def test_cmap_and_color_both(self): def test_2d_coord_with_interval(self): for dim in self.darray.dims: - gp = self.darray.groupby_bins(dim, range(15)).mean(dim) + gp = self.darray.groupby_bins( + dim, range(15), restore_coord_dims=True).mean(dim) for kind in ['imshow', 'pcolormesh', 'contourf', 'contour']: getattr(gp.plot, kind)() diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 4ddd114d767..5da83880539 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1540,6 +1540,42 @@ def test_reduce_funcs(self): assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) + def test_reduce_keepdims(self): + v = Variable(['x', 'y'], self.d) + + assert_identical(v.mean(keepdims=True), + Variable(v.dims, np.mean(self.d, keepdims=True))) + assert_identical(v.mean(dim='x', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=0, + keepdims=True))) + assert_identical(v.mean(dim='y', keepdims=True), + Variable(v.dims, np.mean(self.d, axis=1, + keepdims=True))) + assert_identical(v.mean(dim=['y', 'x'], keepdims=True), + Variable(v.dims, np.mean(self.d, axis=(1, 0), + keepdims=True))) + + v = Variable([], 1.0) + assert_identical(v.mean(keepdims=True), + Variable([], np.mean(v.data, keepdims=True))) + + @requires_dask + def 
test_reduce_keepdims_dask(self): + import dask.array + v = Variable(['x', 'y'], self.d).chunk() + + actual = v.mean(keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, keepdims=True)) + assert_identical(actual, expected) + + actual = v.mean(dim='y', keepdims=True) + assert isinstance(actual.data, dask.array.Array) + + expected = Variable(v.dims, np.mean(self.d, axis=1, keepdims=True)) + assert_identical(actual, expected) + def test_reduce_keep_attrs(self): _attrs = {'units': 'test', 'long_name': 'testing'} diff --git a/xarray/tutorial.py b/xarray/tutorial.py index f54cf7b3889..01d4f181d7f 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -7,7 +7,6 @@ ''' import hashlib import os as _os -import warnings from urllib.request import urlretrieve from .backends.api import open_dataset as _open_dataset @@ -27,7 +26,7 @@ def open_dataset(name, cache=True, cache_dir=_default_cache_dir, github_url='https://github.com/pydata/xarray-data', branch='master', **kws): """ - Load a dataset from the online repository (requires internet). + Open a dataset from the online repository (requires internet). If a local copy is found then always use that to avoid network traffic. @@ -91,17 +90,12 @@ def open_dataset(name, cache=True, cache_dir=_default_cache_dir, def load_dataset(*args, **kwargs): """ - `load_dataset` will be removed a future version of xarray. The current - behavior of this function can be achived by using - `tutorial.open_dataset(...).load()`. + Open, load into memory, and close a dataset from the online repository + (requires internet). See Also -------- open_dataset """ - warnings.warn( - "load_dataset` will be removed in a future version of xarray. The " - "current behavior of this function can be achived by using " - "`tutorial.open_dataset(...).load()`.", - DeprecationWarning, stacklevel=2) - return open_dataset(*args, **kwargs).load() + with open_dataset(*args, **kwargs) as ds: + return ds.load() diff --git a/xarray/util/print_versions.py b/xarray/util/print_versions.py index 50389df85cb..c34faa7487b 100755 --- a/xarray/util/print_versions.py +++ b/xarray/util/print_versions.py @@ -108,6 +108,7 @@ def show_versions(as_json=False): ("matplotlib", lambda mod: mod.__version__), ("cartopy", lambda mod: mod.__version__), ("seaborn", lambda mod: mod.__version__), + ("numbagg", lambda mod: mod.__version__), # xarray setup/test ("setuptools", lambda mod: mod.__version__), ("pip", lambda mod: mod.__version__),
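
The hunks above introduce several user-facing features alongside the linting and CI changes. As a reading aid, the sketches below show those APIs the way the new tests exercise them; they are minimal illustrations written against this commit (the data and variable names are invented for the example), not part of the patch. First, exponentially weighted rolling means via `rolling_exp`, which require the optional `numbagg` dependency:

```python
import numpy as np
import pandas as pd
import xarray as xr

# Exponentially weighted moving average along 'time'; the window types
# mirror pandas.DataFrame.ewm ('span', 'com', 'halflife', 'alpha').
times = pd.date_range('2000-01-01', periods=100)
da = xr.DataArray(np.random.randn(100), coords=[times], dims='time')

result = da.rolling_exp(time=20, window_type='span').mean()

# The new tests cross-check the result against pandas' ewm on the same data.
expected = da.to_series().ewm(span=20).mean()
np.testing.assert_allclose(result.values, expected.values)
```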
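The `keepdims` tests pin down numpy-style semantics for reductions: every reduced dimension survives with length 1, and coordinates involved in the reduction are dropped. A short sketch:

```python
import numpy as np
import xarray as xr

da = xr.DataArray([[-1, 0, 1], [-3, 0, 3]], dims=('x', 'y'))

# Reduced dimensions are kept with size 1, matching numpy's keepdims.
mean = da.mean(keepdims=True)
assert mean.sizes == {'x': 1, 'y': 1}
np.testing.assert_allclose(mean.values, da.values.mean(keepdims=True))
```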
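`test_da_groupby_quantile` covers the new `GroupBy.quantile`; in the scalar case it reduces each group to a single value along the grouped dimension:

```python
import xarray as xr

da = xr.DataArray([1, 2, 3, 4, 5, 6], [('x', [1, 1, 1, 2, 2, 2])])

# One quantile per unique 'x' label.
medians = da.groupby('x').quantile(0.5)
print(medians.values)  # [2. 5.], with coordinate x = [1, 2]
```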
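The transpose tests exercise the new `transpose_coords` flag, which decides whether multidimensional coordinates are transposed along with the data (at this commit, a bare `.transpose()` on an array with such coordinates emits a `FutureWarning`):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((3, 4)), dims=('x', 'y'),
                  coords={'xy': (('x', 'y'), np.arange(12).reshape(3, 4))})

t = da.transpose('y', 'x', transpose_coords=True)
assert t.dims == ('y', 'x')
assert t.coords['xy'].dims == ('y', 'x')  # the 2D coordinate moved too
```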
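Several hunks test the new `errors='ignore'` option to `drop` and `drop_dims`, which turns missing names or index labels into a no-op instead of a `ValueError`/`KeyError`:

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({'a': ('x', [1, 2, 3])})
assert ds.drop('not_found_here', errors='ignore').identical(ds)

# Index labels that do not exist are skipped the same way.
arr = xr.DataArray(np.random.randn(2, 3), coords={'y': [0, 1, 2]},
                   dims=['x', 'y'])
subset = arr.drop([0, 1, 3], dim='y', errors='ignore')  # no label 3
assert list(subset['y'].values) == [2]
```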
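Finally, `test_matmul` checks that the `@` operator simply delegates to `DataArray.dot`, contracting over the dimensions both operands share:

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(6 * 5 * 4).reshape(6, 5, 4),
                  dims=['x', 'y', 'z'])

# All three dimensions are shared, so the product reduces to a scalar.
assert (da @ da).identical(da.dot(da))
```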