diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index cbb63feda96..c6615278852 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -23,6 +23,8 @@ v2023.03.0 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
+- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support complex-valued arrays (:issue:`7340`, :pull:`7392`).
+  By `Michael Niklas `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index 8fe333e735f..2305e753cee 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -9,7 +9,16 @@
 import warnings
 from collections import Counter
 from collections.abc import Hashable, Iterable, Mapping, Sequence
-from typing import TYPE_CHECKING, AbstractSet, Any, Callable, TypeVar, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    AbstractSet,
+    Any,
+    Callable,
+    Literal,
+    TypeVar,
+    Union,
+    overload,
+)
 
 import numpy as np
 
@@ -21,7 +30,7 @@
 from xarray.core.merge import merge_attrs, merge_coordinates_without_align
 from xarray.core.options import OPTIONS, _get_keep_attrs
 from xarray.core.pycompat import is_duck_dask_array
-from xarray.core.types import T_DataArray
+from xarray.core.types import Dims, T_DataArray
 from xarray.core.utils import is_dict_like, is_scalar
 from xarray.core.variable import Variable
 
@@ -1209,7 +1218,9 @@ def apply_ufunc(
     return apply_array_ufunc(func, *args, dask=dask)
 
 
-def cov(da_a, da_b, dim=None, ddof=1):
+def cov(
+    da_a: T_DataArray, da_b: T_DataArray, dim: Dims = None, ddof: int = 1
+) -> T_DataArray:
     """
     Compute covariance between two DataArray objects along a shared dimension.
 
@@ -1219,9 +1230,9 @@ def cov(da_a, da_b, dim=None, ddof=1):
         Array to compute.
     da_b : DataArray
         Array to compute.
-    dim : str, optional
+    dim : str, iterable of hashable, "..." or None, optional
         The dimension along which the covariance will be computed
-    ddof : int, optional
+    ddof : int, default: 1
         If ddof=1, covariance is normalized by N-1, giving an unbiased estimate,
         else normalization is by N.
 
@@ -1289,7 +1300,7 @@ def cov(da_a, da_b, dim=None, ddof=1):
     return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov")
 
 
-def corr(da_a, da_b, dim=None):
+def corr(da_a: T_DataArray, da_b: T_DataArray, dim: Dims = None) -> T_DataArray:
     """
     Compute the Pearson correlation coefficient between
     two DataArray objects along a shared dimension.
@@ -1300,7 +1311,7 @@ def corr(da_a, da_b, dim=None):
         Array to compute.
     da_b : DataArray
         Array to compute.
-    dim : str, optional
+    dim : str, iterable of hashable, "..." or None, optional
         The dimension along which the correlation will be computed
 
     Returns
@@ -1368,7 +1379,11 @@ def corr(da_a, da_b, dim=None):
 
 
 def _cov_corr(
-    da_a: T_DataArray, da_b: T_DataArray, dim=None, ddof=0, method=None
+    da_a: T_DataArray,
+    da_b: T_DataArray,
+    dim: Dims = None,
+    ddof: int = 0,
+    method: Literal["cov", "corr", None] = None,
 ) -> T_DataArray:
     """
     Internal method for xr.cov() and xr.corr() so only have to
@@ -1388,22 +1403,21 @@ def _cov_corr(
     demeaned_da_b = da_b - da_b.mean(dim=dim)
 
     # 4. Compute covariance along the given dim
-    #
     # N.B. `skipna=True` is required or auto-covariance is computed incorrectly. E.g.
     # Try xr.cov(da,da) for da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
-    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True, min_count=1) / (
-        valid_count
-    )
+    cov = (demeaned_da_a.conj() * demeaned_da_b).sum(
+        dim=dim, skipna=True, min_count=1
+    ) / (valid_count)
 
     if method == "cov":
-        return cov
+        return cov  # type: ignore[return-value]
 
     else:
         # compute std + corr
         da_a_std = da_a.std(dim=dim)
         da_b_std = da_b.std(dim=dim)
         corr = cov / (da_a_std * da_b_std)
-        return corr
+        return corr  # type: ignore[return-value]
 
 
 def cross(
@@ -1616,7 +1630,7 @@ def cross(
 
 def dot(
     *arrays,
-    dims: str | Iterable[Hashable] | ellipsis | None = None,
+    dims: Dims = None,
     **kwargs: Any,
 ):
     """Generalized dot product for xarray objects. Like np.einsum, but
@@ -1626,7 +1640,7 @@ def dot(
     ----------
     *arrays : DataArray or Variable
         Arrays to compute.
-    dims : ..., str or tuple of str, optional
+    dims : str, iterable of hashable, "..." or None, optional
         Which dimensions to sum over. Ellipsis ('...') sums over all dimensions.
         If not specified, then all the common dimensions are summed over.
     **kwargs : dict
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
index c1be98670cb..8e45a07d6a2 100644
--- a/xarray/tests/test_computation.py
+++ b/xarray/tests/test_computation.py
@@ -1387,17 +1387,18 @@ def test_vectorize_exclude_dims_dask() -> None:
 
 def test_corr_only_dataarray() -> None:
     with pytest.raises(TypeError, match="Only xr.DataArray is supported"):
-        xr.corr(xr.Dataset(), xr.Dataset())
+        xr.corr(xr.Dataset(), xr.Dataset())  # type: ignore[type-var]
 
 
-def arrays_w_tuples():
+@pytest.fixture(scope="module")
+def arrays():
     da = xr.DataArray(
         np.random.random((3, 21, 4)),
         coords={"time": pd.date_range("2000-01-01", freq="1D", periods=21)},
         dims=("a", "time", "x"),
     )
 
-    arrays = [
+    return [
         da.isel(time=range(0, 18)),
         da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(),
         xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]),
@@ -1405,7 +1406,10 @@ def arrays_w_tuples():
         xr.DataArray([[1, 2], [2, 1]], dims=["x", "time"]),
     ]
 
-    array_tuples = [
+
+@pytest.fixture(scope="module")
+def array_tuples(arrays):
+    return [
         (arrays[0], arrays[0]),
         (arrays[0], arrays[1]),
         (arrays[1], arrays[1]),
@@ -1417,27 +1421,19 @@ def arrays_w_tuples():
         (arrays[4], arrays[4]),
     ]
 
-    return arrays, array_tuples
-
 
 @pytest.mark.parametrize("ddof", [0, 1])
-@pytest.mark.parametrize(
-    "da_a, da_b",
-    [
-        arrays_w_tuples()[1][3],
-        arrays_w_tuples()[1][4],
-        arrays_w_tuples()[1][5],
-        arrays_w_tuples()[1][6],
-        arrays_w_tuples()[1][7],
-        arrays_w_tuples()[1][8],
-    ],
-)
+@pytest.mark.parametrize("n", [3, 4, 5, 6, 7, 8])
 @pytest.mark.parametrize("dim", [None, "x", "time"])
 @requires_dask
-def test_lazy_corrcov(da_a, da_b, dim, ddof) -> None:
+def test_lazy_corrcov(
+    n: int, dim: str | None, ddof: int, array_tuples: tuple[xr.DataArray, xr.DataArray]
+) -> None:
     # GH 5284
     from dask import is_dask_collection
 
+    da_a, da_b = array_tuples[n]
+
     with raise_if_dask_computes():
         cov = xr.cov(da_a.chunk(), da_b.chunk(), dim=dim, ddof=ddof)
         assert is_dask_collection(cov)
@@ -1447,12 +1443,13 @@ def test_lazy_corrcov(da_a, da_b, dim, ddof) -> None:
 
 
 @pytest.mark.parametrize("ddof", [0, 1])
-@pytest.mark.parametrize(
-    "da_a, da_b",
-    [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
-)
+@pytest.mark.parametrize("n", [0, 1, 2])
 @pytest.mark.parametrize("dim", [None, "time"])
-def test_cov(da_a, da_b, dim, ddof) -> None:
+def test_cov(
+    n: int, dim: str | None, ddof: int, array_tuples: tuple[xr.DataArray, xr.DataArray]
+) -> None:
+    da_a, da_b = array_tuples[n]
+
     if dim is not None:
 
         def np_cov_ind(ts1, ts2, a, x):
@@ -1499,12 +1496,13 @@ def np_cov(ts1, ts2):
     assert_allclose(actual, expected)
 
 
-@pytest.mark.parametrize(
-    "da_a, da_b",
-    [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
-)
+@pytest.mark.parametrize("n", [0, 1, 2])
 @pytest.mark.parametrize("dim", [None, "time"])
-def test_corr(da_a, da_b, dim) -> None:
+def test_corr(
+    n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
+) -> None:
+    da_a, da_b = array_tuples[n]
+
     if dim is not None:
 
         def np_corr_ind(ts1, ts2, a, x):
@@ -1547,12 +1545,12 @@ def np_corr(ts1, ts2):
     assert_allclose(actual, expected)
 
 
-@pytest.mark.parametrize(
-    "da_a, da_b",
-    arrays_w_tuples()[1],
-)
+@pytest.mark.parametrize("n", range(9))
 @pytest.mark.parametrize("dim", [None, "time", "x"])
-def test_covcorr_consistency(da_a, da_b, dim) -> None:
+def test_covcorr_consistency(
+    n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
+) -> None:
+    da_a, da_b = array_tuples[n]
     # Testing that xr.corr and xr.cov are consistent with each other
     # 1. Broadcast the two arrays
     da_a, da_b = broadcast(da_a, da_b)
@@ -1569,10 +1567,13 @@ def test_covcorr_consistency(da_a, da_b, dim) -> None:
 
 
 @requires_dask
-@pytest.mark.parametrize("da_a, da_b", arrays_w_tuples()[1])
+@pytest.mark.parametrize("n", range(9))
 @pytest.mark.parametrize("dim", [None, "time", "x"])
 @pytest.mark.filterwarnings("ignore:invalid value encountered in .*divide")
-def test_corr_lazycorr_consistency(da_a, da_b, dim) -> None:
+def test_corr_lazycorr_consistency(
+    n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
+) -> None:
+    da_a, da_b = array_tuples[n]
     da_al = da_a.chunk()
     da_bl = da_b.chunk()
     c_abl = xr.corr(da_al, da_bl, dim=dim)
@@ -1591,22 +1592,27 @@ def test_corr_dtype_error():
     xr.testing.assert_equal(xr.corr(da_a, da_b), xr.corr(da_a, da_b.chunk()))
 
 
-@pytest.mark.parametrize(
-    "da_a",
-    arrays_w_tuples()[0],
-)
+@pytest.mark.parametrize("n", range(5))
 @pytest.mark.parametrize("dim", [None, "time", "x", ["time", "x"]])
-def test_autocov(da_a, dim) -> None:
+def test_autocov(n: int, dim: str | None, arrays) -> None:
+    da = arrays[n]
+
     # Testing that the autocovariance*(N-1) is ~=~ to the variance matrix
     # 1. Ignore the nans
-    valid_values = da_a.notnull()
+    valid_values = da.notnull()
     # Because we're using ddof=1, this requires > 1 value in each sample
-    da_a = da_a.where(valid_values.sum(dim=dim) > 1)
-    expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
-    actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1)
+    da = da.where(valid_values.sum(dim=dim) > 1)
+    expected = ((da - da.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
+    actual = xr.cov(da, da, dim=dim) * (valid_values.sum(dim) - 1)
     assert_allclose(actual, expected)
 
 
+def test_complex_cov() -> None:
+    da = xr.DataArray([1j, -1j])
+    actual = xr.cov(da, da)
+    assert abs(actual.item()) == 2
+
+
 @requires_dask
 def test_vectorize_dask_new_output_dims() -> None:
     # regression test for GH3574
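A quick usage sketch of the behavior this patch enables (illustrative only, not part of the patch): with the conjugate taken on the first argument, `xr.cov` computes the Hermitian covariance, so the autocovariance of a complex-valued DataArray comes out real and nonnegative, and `xr.corr` of an array with itself is 1. The sample values below are made up for illustration.

    import numpy as np
    import xarray as xr

    # Illustrative complex-valued data; any 1-D complex array works here.
    da = xr.DataArray([1 + 1j, 2 - 1j, 3 + 0j], dims="time")

    # Autocovariance: conj(x - mean) * (x - mean) = |x - mean|**2 summed over
    # "time", so the result is real and nonnegative; here 4 / (3 - 1) = 2
    # with the default ddof=1.
    print(xr.cov(da, da).item())      # (2+0j)

    # np.cov agrees in the self-covariance case.
    print(np.cov(da.values).item())   # (2+0j)

    # Pearson correlation of an array with itself is 1.
    print(xr.corr(da, da).item())     # (1+0j), up to floating-point error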