From be333e4f1ae6dffcf77c9def8753899e465eca4e Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 15 Nov 2019 12:16:42 +0000 Subject: [PATCH 1/6] Closes #3409 --- ci/azure/install.yml | 2 +- doc/whats-new.rst | 7 ++++++- xarray/core/duck_array_ops.py | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/ci/azure/install.yml b/ci/azure/install.yml index fee886ba804..baa69bcc8d5 100644 --- a/ci/azure/install.yml +++ b/ci/azure/install.yml @@ -16,9 +16,9 @@ steps: --pre \ --upgrade \ matplotlib \ + numpy \ pandas \ scipy - # numpy \ # FIXME https://github.com/pydata/xarray/issues/3409 pip install \ --no-deps \ --upgrade \ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..c10d49537dd 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,6 +100,12 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. +- Add support for numpy >=1.18 (:issue:`3409`). + By `Guido Imperiale `_. +- Add support for pandas >=0.26 (:issue:`3440`). + By `Deepak Cherian `_. +- Add support for pseudonetcdf >=3.1 (:pull:`3485`). + By `Barron Henderson `_. Documentation ~~~~~~~~~~~~~ @@ -118,7 +124,6 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ - - Added integration tests against `pint `_. (:pull:`3238`, :pull:`3447`, :pull:`3493`, :pull:`3508`) by `Justus Magin `_. diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 71e79335c3d..144a00bf072 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +from distutils.version import LooseVersion from . import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast @@ -351,6 +352,32 @@ def f(values, axis=None, skipna=None, **kwargs): _mean = _create_nan_agg_method("mean") +def _datetime_crude_nanmin(array): + """Implement nanmin for datetime64 arrays, with caveats: + + - can't accept an axis parameter + - will return incorrect results if the array exclusively contains NaT + """ + if LooseVersion(np.__version__) < "1.18": + # numpy.min < 1.18 incorrectly skips NaT - which we exploit here + return min(array, skipna=False) + # This requires numpy >= 1.15 + + from .dataarray import DataArray + from .variable import Variable + + if isinstance(array, (DataArray, Variable)): + array = array.data + array = array.ravel() + array = array[~pandas_isnull(array)] + + assert array.dtype.kind in "Mm" + assert array.dtype.itemsize == 8 + initial = np.array(2 ** 63 - 1, dtype=array.dtype) + + return min(array, initial=initial, skipna=False) + + def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): """Convert an array containing datetime-like data to an array of floats. @@ -370,7 +397,10 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): """ # TODO: make this function dask-compatible? if offset is None: - offset = array.min() + if array.dtype.kind in "Mm": + offset = _datetime_crude_nanmin(array) + else: + offset = min(array) array = array - offset if not hasattr(array, "dtype"): # scalar is converted to 0d-array @@ -401,7 +431,8 @@ def mean(array, axis=None, skipna=None, **kwargs): array = asarray(array) if array.dtype.kind in "Mm": - offset = min(array) + offset = _datetime_crude_nanmin(array) + # xarray always uses np.datetime64[ns] for np.datetime64 data dtype = "timedelta64[ns]" return ( From 3daf5222b86cc6ed85c18f076d644fc614fa3a2f Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 15 Nov 2019 12:18:29 +0000 Subject: [PATCH 2/6] Unpin versions --- ci/requirements/py36.yml | 2 +- ci/requirements/py37.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/requirements/py36.yml b/ci/requirements/py36.yml index 10fe69253e8..820160b19cc 100644 --- a/ci/requirements/py36.yml +++ b/ci/requirements/py36.yml @@ -25,7 +25,7 @@ dependencies: - nc-time-axis - netcdf4 - numba - - numpy<1.18 # FIXME https://github.com/pydata/xarray/issues/3409 + - numpy - pandas - pint - pip diff --git a/ci/requirements/py37.yml b/ci/requirements/py37.yml index 827c664a222..4a7aaf7d32b 100644 --- a/ci/requirements/py37.yml +++ b/ci/requirements/py37.yml @@ -25,7 +25,7 @@ dependencies: - nc-time-axis - netcdf4 - numba - - numpy<1.18 # FIXME https://github.com/pydata/xarray/issues/3409 + - numpy - pandas - pint - pip From 0e390ea4e029c887889c4e938d7a49ad878ff256 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 15 Nov 2019 18:47:13 +0000 Subject: [PATCH 3/6] Rewrite unit test for clarity about its real scope --- xarray/core/duck_array_ops.py | 3 +-- xarray/tests/test_duck_array_ops.py | 39 ++++++++++++++++------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 144a00bf072..a1336d46ec5 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -357,6 +357,7 @@ def _datetime_crude_nanmin(array): - can't accept an axis parameter - will return incorrect results if the array exclusively contains NaT + - doesn't work with dask! """ if LooseVersion(np.__version__) < "1.18": # numpy.min < 1.18 incorrectly skips NaT - which we exploit here @@ -371,10 +372,8 @@ def _datetime_crude_nanmin(array): array = array.ravel() array = array[~pandas_isnull(array)] - assert array.dtype.kind in "Mm" assert array.dtype.itemsize == 8 initial = np.array(2 ** 63 - 1, dtype=array.dtype) - return min(array, initial=initial, skipna=False) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index f678af2fec5..43607225c0f 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -274,23 +274,28 @@ def assert_dask_array(da, dask): @arm_xfail -@pytest.mark.parametrize("dask", [False, True]) -def test_datetime_reduce(dask): - time = np.array(pd.date_range("15/12/1999", periods=11)) - time[8:11] = np.nan - da = DataArray(np.linspace(0, 365, num=11), dims="time", coords={"time": time}) - - if dask and has_dask: - chunks = {"time": 5} - da = da.chunk(chunks) - - actual = da["time"].mean() - assert not pd.isnull(actual) - actual = da["time"].mean(skipna=False) - assert pd.isnull(actual) - - # test for a 0d array - assert da["time"][0].mean() == da["time"][:1].mean() +def test_datetime_mean(): + # Note: only testing numpy, as dask is broken upstream + da = DataArray( + np.array(["2010-01-01", "NaT", "2010-01-03"], dtype="M8"), dims=["time"] + ) + expect = DataArray(np.array("2010-01-02", dtype="M8")) + expect_nat = DataArray(np.array("NaT", dtype="M8")) + + actual = da.mean() + assert_equal(actual, expect) + actual = da.mean(skipna=False) + assert_equal(actual, expect_nat) + + # tests for 1d array full of NaT + assert_equal(da[[1]].mean(), expect_nat) + assert_equal(da[[1]].mean(skipna=False), expect_nat) + + # tests for a 0d array + assert_equal(da[0].mean(), da[0]) + assert_equal(da[0].mean(skipna=False), da[0]) + assert_equal(da[1].mean(), expect_nat) + assert_equal(da[1].mean(skipna=False), expect_nat) @requires_cftime From 341fad3f84e5625629123fbb7cfd96b9c2d37d1d Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Mon, 18 Nov 2019 10:30:15 +0000 Subject: [PATCH 4/6] mean() on dask --- doc/whats-new.rst | 4 ++-- xarray/core/duck_array_ops.py | 35 +++++++++++++++-------------- xarray/tests/test_duck_array_ops.py | 15 +++++++++++-- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index bdd6d88f1f5..0c929b5b711 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -115,8 +115,8 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. -- Add support for numpy >=1.18 (:issue:`3409`). - By `Guido Imperiale `_. +- Add support for numpy >=1.18 (); bugfix mean() on datetime64 arrays on dask backend + (:issue:`3409`, :pull:`3537`). By `Guido Imperiale `_. - Add support for pandas >=0.26 (:issue:`3440`). By `Deepak Cherian `_. - Add support for pseudonetcdf >=3.1 (:pull:`3485`). diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a1336d46ec5..8d0281e43c1 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -352,29 +352,30 @@ def f(values, axis=None, skipna=None, **kwargs): _mean = _create_nan_agg_method("mean") -def _datetime_crude_nanmin(array): - """Implement nanmin for datetime64 arrays, with caveats: +def _datetime_nanmin(array): + """nanmin() function for datetime64. - - can't accept an axis parameter - - will return incorrect results if the array exclusively contains NaT - - doesn't work with dask! - """ - if LooseVersion(np.__version__) < "1.18": - # numpy.min < 1.18 incorrectly skips NaT - which we exploit here - return min(array, skipna=False) - # This requires numpy >= 1.15 + Caveats that this function deals with: + - In numpy < 1.18, min() on datetime64 incorrectly ignores NaT + - numpy nanmin() don't work on datetime64 (all versions at the moment of writing) + - dask min() does not work on datetime64 (all versions at the moment of writing) + """ from .dataarray import DataArray from .variable import Variable if isinstance(array, (DataArray, Variable)): array = array.data - array = array.ravel() - array = array[~pandas_isnull(array)] - assert array.dtype.itemsize == 8 - initial = np.array(2 ** 63 - 1, dtype=array.dtype) - return min(array, initial=initial, skipna=False) + assert array.dtype.kind in "mM" + dtype = array.dtype + # (NaT).astype(float) does not produce NaN... + array = where(pandas_isnull(array), np.nan, array.astype(float)) + array = min(array, skipna=True) + if isinstance(array, float): + array = np.array(array) + # ...but (NaN).astype("M8") does produce NaT + return array.astype(dtype) def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): @@ -397,7 +398,7 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): # TODO: make this function dask-compatible? if offset is None: if array.dtype.kind in "Mm": - offset = _datetime_crude_nanmin(array) + offset = _datetime_nanmin(array) else: offset = min(array) array = array - offset @@ -430,7 +431,7 @@ def mean(array, axis=None, skipna=None, **kwargs): array = asarray(array) if array.dtype.kind in "Mm": - offset = _datetime_crude_nanmin(array) + offset = _datetime_nanmin(array) # xarray always uses np.datetime64[ns] for np.datetime64 data dtype = "timedelta64[ns]" diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 43607225c0f..041c72e6ea4 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -274,17 +274,28 @@ def assert_dask_array(da, dask): @arm_xfail -def test_datetime_mean(): +@pytest.mark.parametrize("dask", [False, True]) +def test_datetime_mean(dask): # Note: only testing numpy, as dask is broken upstream da = DataArray( - np.array(["2010-01-01", "NaT", "2010-01-03"], dtype="M8"), dims=["time"] + np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8"), + dims=["time"], ) + if dask and has_dask: + # Trigger use case where a chunk is full of NaT + da = da.chunk({"time": 3}) + expect = DataArray(np.array("2010-01-02", dtype="M8")) expect_nat = DataArray(np.array("NaT", dtype="M8")) actual = da.mean() + if dask: + assert actual.chunks is not None assert_equal(actual, expect) + actual = da.mean(skipna=False) + if dask: + assert actual.chunks is not None assert_equal(actual, expect_nat) # tests for 1d array full of NaT From 5c7c271a67b8763f2eeb38d7f5fc7e2540260d69 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Mon, 18 Nov 2019 11:22:22 +0000 Subject: [PATCH 5/6] Trivial --- xarray/core/duck_array_ops.py | 1 - xarray/tests/test_duck_array_ops.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 8d0281e43c1..9328d23d00c 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd -from distutils.version import LooseVersion from . import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 041c72e6ea4..aee7bbd6b11 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -274,14 +274,14 @@ def assert_dask_array(da, dask): @arm_xfail -@pytest.mark.parametrize("dask", [False, True]) +@pytest.mark.parametrize("dask", [False, True] if has_dask else [False]) def test_datetime_mean(dask): # Note: only testing numpy, as dask is broken upstream da = DataArray( np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8"), dims=["time"], ) - if dask and has_dask: + if dask: # Trigger use case where a chunk is full of NaT da = da.chunk({"time": 3}) From 7767fec9e5cd1e3478c2d153e4e86da30ddfb630 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Tue, 19 Nov 2019 09:22:39 +0000 Subject: [PATCH 6/6] duck_array_ops should never receive xarray.Variable --- xarray/core/dataset.py | 4 +++- xarray/core/duck_array_ops.py | 6 ------ xarray/tests/test_dataset.py | 4 +++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5de254614ff..c631a4c11ea 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5316,7 +5316,9 @@ def _integrate_one(self, coord, datetime_unit=None): datetime_unit, _ = np.datetime_data(coord_var.dtype) elif datetime_unit is None: datetime_unit = "s" # Default to seconds for cftime objects - coord_var = datetime_to_numeric(coord_var, datetime_unit=datetime_unit) + coord_var = coord_var._replace( + data=datetime_to_numeric(coord_var.data, datetime_unit=datetime_unit) + ) variables = {} coord_names = set() diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 9328d23d00c..cf616acb485 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -360,12 +360,6 @@ def _datetime_nanmin(array): - numpy nanmin() don't work on datetime64 (all versions at the moment of writing) - dask min() does not work on datetime64 (all versions at the moment of writing) """ - from .dataarray import DataArray - from .variable import Variable - - if isinstance(array, (DataArray, Variable)): - array = array.data - assert array.dtype.kind in "mM" dtype = array.dtype # (NaT).astype(float) does not produce NaN... diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index be40ce7c6e8..de074da541f 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5874,7 +5874,9 @@ def test_trapz_datetime(dask, which_datetime): actual = da.integrate("time", datetime_unit="D") expected_data = np.trapz( - da, duck_array_ops.datetime_to_numeric(da["time"], datetime_unit="D"), axis=0 + da.data, + duck_array_ops.datetime_to_numeric(da["time"].data, datetime_unit="D"), + axis=0, ) expected = xr.DataArray( expected_data,