From ec01625c88e8356460983c3be639571a23eaae61 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 15 Jul 2019 00:27:44 -0700 Subject: [PATCH 1/3] Internal clean-up of isnull() to avoid relying on pandas This version should be much more compatible out of the box with duck typing. --- properties/test_encode_decode.py | 2 -- xarray/core/duck_array_ops.py | 50 +++++++++++++++++++++++------ xarray/tests/test_duck_array_ops.py | 25 ++++++++++++--- 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py index 13f63f259cf..4b9aa8928b4 100644 --- a/properties/test_encode_decode.py +++ b/properties/test_encode_decode.py @@ -4,8 +4,6 @@ These ones pass, just as you'd hope! """ -from __future__ import absolute_import, division, print_function - import hypothesis.extra.numpy as npst import hypothesis.strategies as st from hypothesis import given, settings diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index c4db95cfd4e..9f0abae3871 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -63,19 +63,49 @@ def fail_on_dask_array_input(values, msg=None, func_name=None): around = _dask_or_eager_func('around') isclose = _dask_or_eager_func('isclose') -notnull = _dask_or_eager_func('notnull', eager_module=pd) -_isnull = _dask_or_eager_func('isnull', eager_module=pd) + +if hasattr(np, 'isnat'): + # numpy 1.13 + isnat = _dask_or_eager_func('isnat') +else: + isnat = _dask_or_eager_func('isnull', eager_module=pd) +isnan = _dask_or_eager_func('isnan') +zeros_like = _dask_or_eager_func('zeros_like') + + +pandas_isnull = _dask_or_eager_func('isnull', eager_module=pd) def isnull(data): - # GH837, GH861 - # isnull fcn from pandas will throw TypeError when run on numpy structured - # array therefore for dims that are np structured arrays we assume all - # data is present - try: - return _isnull(data) - except TypeError: - return np.zeros(data.shape, dtype=bool) + data = asarray(data) + scalar_type = data.dtype.type + if issubclass(scalar_type, (np.datetime64, np.timedelta64)): + # datetime types use NaT for null + # note: must check timedelta64 before integers, because currently + # timedelta64 inherits from np.integer + return isnat(data) + elif issubclass(scalar_type, np.inexact): + # float types use NaN for null + return isnan(data) + elif issubclass( + scalar_type, (np.bool_, np.integer, np.character, np.void) + ): + # these types cannot represent missing values + return zeros_like(data, dtype=bool) + else: + # at this point, array should have dtype=object + if isinstance(data, (np.ndarray, dask_array_type)): + return pandas_isnull(data) + else: + # Not reachable yet, but intended for use with other duck array + # types. For full consistency with pandas, we should accept None as + # a null value as well as NaN, but it isn't clear how to do this + # with duck typing. + return data != data + + +def notnull(data): + return ~isnull(data) transpose = _dask_or_eager_func('transpose') diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 87a7a2863d3..f227b8f55ba 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -178,14 +178,18 @@ def test_wrong_shape(self): assert not array_notnull_equiv(a, b) @pytest.mark.parametrize("val1, val2, val3, null", [ - (1, 2, 3, None), + (np.datetime64('2000'), + np.datetime64('2001'), + np.datetime64('2002'), + np.datetime64('NaT')), (1., 2., 3., np.nan), - (1., 2., 3., None), ('foo', 'bar', 'baz', None), + ('foo', 'bar', 'baz', np.nan), ]) def test_types(self, val1, val2, val3, null): - arr1 = np.array([val1, null, val3, null]) - arr2 = np.array([val1, val2, null, null]) + dtype = object if isinstance(val1, str) else None + arr1 = np.array([val1, null, val3, null], dtype=dtype) + arr2 = np.array([val1, val2, null, null], dtype=dtype) assert array_notnull_equiv(arr1, arr2) @@ -431,6 +435,19 @@ def test_argmin_max_error(): da.argmin(dim='y') +@pytest.mark.parametrize('array', [ + np.array([np.datetime64('2000-01-01'), np.datetime64('NaT')]), + np.array([np.timedelta64(1, 'h'), np.timedelta64('NaT')]), + np.array([0.0, np.nan]), + np.array([1j, np.nan]), + np.array(['foo', np.nan], dtype=object), +]) +def test_isnull(array): + expected = np.array([False, True]) + actual = duck_array_ops.isnull(array) + np.testing.assert_equal(expected, actual) + + @requires_dask def test_isnull_with_dask(): da = construct_dataarray(2, np.float32, contains_nan=True, dask=True) From fcfd4171d91c13fe86856cfc396ca623e1c31626 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 4 Aug 2019 14:33:31 -0700 Subject: [PATCH 2/3] Use isnat ufunc --- xarray/core/duck_array_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 9f0abae3871..a60227265db 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -64,9 +64,10 @@ def fail_on_dask_array_input(values, msg=None, func_name=None): around = _dask_or_eager_func('around') isclose = _dask_or_eager_func('isclose') -if hasattr(np, 'isnat'): - # numpy 1.13 - isnat = _dask_or_eager_func('isnat') +if hasattr(np, 'isnat') and ( + dask_array is None or hasattr(dask_array_type, '__array_ufunc__')): + # can use the isnat ufunc + isnat = np.isnat else: isnat = _dask_or_eager_func('isnull', eager_module=pd) isnan = _dask_or_eager_func('isnan') From 0818c3af81507fed5fa9f8fbc83a7c29fb094219 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 4 Aug 2019 14:58:05 -0700 Subject: [PATCH 3/3] update comment --- xarray/core/duck_array_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a60227265db..ac204df568f 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -66,7 +66,8 @@ def fail_on_dask_array_input(values, msg=None, func_name=None): if hasattr(np, 'isnat') and ( dask_array is None or hasattr(dask_array_type, '__array_ufunc__')): - # can use the isnat ufunc + # np.isnat is available since NumPy 1.13, so __array_ufunc__ is always + # supported. isnat = np.isnat else: isnat = _dask_or_eager_func('isnull', eager_module=pd)