Skip to content

Commit

Permalink
xr.cov() and xr.corr() (#4089)
Browse files Browse the repository at this point in the history
* Added chunks='auto' option in dataset.py

* reverted accidental changes in dataset.chunk()

* Added corr and cov to computation.py. Taken from r-beer:xarray/corr

* Added r-beer's tests to test_computation.py

Still issues I think

* trying to fix github.com//pull/3550#discussion_r349935731

* Removing drop=True from the `.where()` calls in `computation.py`+test.py

* api.rst and whats-new.rst

* Updated `xarray/__init__.py` and added `broadcast` import to computation

* added DataArray import to corr, cov

* assert_allclose added to test_computation.py

* removed whitespace in test_dask...oops

* Added to init

* format changes

* Fiddling around with cov/corr tests in `test_computation.py`

* PEP8 changes

* pep

* remove old todo and comments

* isort

* Added consistency check between corr() and cov(), ensure they give same

* added `skipna=False` to `computation.py`. made consistency+autocov tests

* formatting

* Added numpy-based tests.

* format

* formatting again

* Update doc/whats-new.rst

Co-authored-by: keewis <[email protected]>

* refactored corr/cov so there is one internal method for calculating both

* formatting

* updating docstrings and code suggestions from PR

* paramterize ddof in tests

* removed extraneous test arrays

* formatting + adding deterministic docstring

* added test for TypeError

* formatting

* tidying up docstring

* formatting and tidying up `_cov_corr()` so that the logic is more clear

* flake8 ...

Co-authored-by: keewis <[email protected]>
  • Loading branch information
AndrewILWilliams and keewis authored May 25, 2020
1 parent bdb1d33 commit 3194b3e
Show file tree
Hide file tree
Showing 5 changed files with 343 additions and 3 deletions.
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Top-level functions
full_like
zeros_like
ones_like
cov
corr
dot
polyval
map_blocks
Expand Down
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:`3550`, :pull:`4089`).
By `Andrew Williams <https://github.com/AndrewWilliams3142>`_ and `Robin Beer <https://github.com/r-beer>`_.
- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
By `Pascal Bourgault <https://github.com/aulemahal>`_.
- Control over attributes of result in :py:func:`merge`, :py:func:`concat`,
Expand Down
4 changes: 3 additions & 1 deletion xarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .core.alignment import align, broadcast
from .core.combine import auto_combine, combine_by_coords, combine_nested
from .core.common import ALL_DIMS, full_like, ones_like, zeros_like
from .core.computation import apply_ufunc, dot, polyval, where
from .core.computation import apply_ufunc, corr, cov, dot, polyval, where
from .core.concat import concat
from .core.dataarray import DataArray
from .core.dataset import Dataset
Expand Down Expand Up @@ -54,6 +54,8 @@
"concat",
"decode_cf",
"dot",
"cov",
"corr",
"full_like",
"load_dataarray",
"load_dataset",
Expand Down
180 changes: 179 additions & 1 deletion xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import numpy as np

from . import dtypes, duck_array_ops, utils
from .alignment import deep_align
from .alignment import align, deep_align
from .merge import merge_coordinates_without_align
from .options import OPTIONS
from .pycompat import dask_array_type
Expand Down Expand Up @@ -1069,6 +1069,184 @@ def earth_mover_distance(first_samples,
return apply_array_ufunc(func, *args, dask=dask)


def cov(da_a, da_b, dim=None, ddof=1):
"""
Compute covariance between two DataArray objects along a shared dimension.
Parameters
----------
da_a: DataArray object
Array to compute.
da_b: DataArray object
Array to compute.
dim : str, optional
The dimension along which the covariance will be computed
ddof: int, optional
If ddof=1, covariance is normalized by N-1, giving an unbiased estimate,
else normalization is by N.
Returns
-------
covariance: DataArray
See also
--------
pandas.Series.cov: corresponding pandas function
xr.corr: respective function to calculate correlation
Examples
--------
>>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
>>> da_a
<xarray.DataArray (space: 3, time: 3)>
array([[1. , 2. , 3. ],
[0.1, 0.2, 0.3],
[3.2, 0.6, 1.8]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
>>> da_a = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
>>> da_b
<xarray.DataArray (space: 3, time: 3)>
array([[ 0.2, 0.4, 0.6],
[15. , 10. , 5. ],
[ 3.2, 0.6, 1.8]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
>>> xr.cov(da_a, da_b)
<xarray.DataArray ()>
array(-3.53055556)
>>> xr.cov(da_a, da_b, dim='time')
<xarray.DataArray (space: 3)>
array([ 0.2, -0.5, 1.69333333])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
"""
from .dataarray import DataArray

if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
raise TypeError(
"Only xr.DataArray is supported."
"Given {}.".format([type(arr) for arr in [da_a, da_b]])
)

return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov")


def corr(da_a, da_b, dim=None):
"""
Compute the Pearson correlation coefficient between
two DataArray objects along a shared dimension.
Parameters
----------
da_a: DataArray object
Array to compute.
da_b: DataArray object
Array to compute.
dim: str, optional
The dimension along which the correlation will be computed
Returns
-------
correlation: DataArray
See also
--------
pandas.Series.corr: corresponding pandas function
xr.cov: underlying covariance function
Examples
--------
>>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
>>> da_a
<xarray.DataArray (space: 3, time: 3)>
array([[1. , 2. , 3. ],
[0.1, 0.2, 0.3],
[3.2, 0.6, 1.8]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
>>> da_a = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
>>> da_b
<xarray.DataArray (space: 3, time: 3)>
array([[ 0.2, 0.4, 0.6],
[15. , 10. , 5. ],
[ 3.2, 0.6, 1.8]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
>>> xr.corr(da_a, da_b)
<xarray.DataArray ()>
array(-0.57087777)
>>> xr.corr(da_a, da_b, dim='time')
<xarray.DataArray (space: 3)>
array([ 1., -1., 1.])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
"""
from .dataarray import DataArray

if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
raise TypeError(
"Only xr.DataArray is supported."
"Given {}.".format([type(arr) for arr in [da_a, da_b]])
)

return _cov_corr(da_a, da_b, dim=dim, method="corr")


def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
"""
Internal method for xr.cov() and xr.corr() so only have to
sanitize the input arrays once and we don't repeat code.
"""
# 1. Broadcast the two arrays
da_a, da_b = align(da_a, da_b, join="inner", copy=False)

# 2. Ignore the nans
valid_values = da_a.notnull() & da_b.notnull()

if not valid_values.all():
da_a = da_a.where(valid_values)
da_b = da_b.where(valid_values)

valid_count = valid_values.sum(dim) - ddof

# 3. Detrend along the given dim
demeaned_da_a = da_a - da_a.mean(dim=dim)
demeaned_da_b = da_b - da_b.mean(dim=dim)

# 4. Compute covariance along the given dim
# N.B. `skipna=False` is required or there is a bug when computing
# auto-covariance. E.g. Try xr.cov(da,da) for
# da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count)

if method == "cov":
return cov

else:
# compute std + corr
da_a_std = da_a.std(dim=dim)
da_b_std = da_b.std(dim=dim)
corr = cov / (da_a_std * da_b_std)
return corr


def dot(*arrays, dims=None, **kwargs):
"""Generalized dot product for xarray objects. Like np.einsum, but
provides a simpler interface based on array dimensions.
Expand Down
Loading

0 comments on commit 3194b3e

Please sign in to comment.