diff --git a/doc/computation.rst b/doc/computation.rst index 663c546be20..240a1e5704b 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -95,6 +95,9 @@ for filling missing values via 1D interpolation. Note that xarray slightly diverges from the pandas ``interpolate`` syntax by providing the ``use_coordinate`` keyword which facilitates a clear specification of which values to use as the index in the interpolation. +xarray also provides the ``max_gap`` keyword argument to limit the interpolation to +data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na` +for more. Aggregation =========== diff --git a/doc/conf.py b/doc/conf.py index 7c1557a1e66..0e04f8ccde8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -340,9 +340,10 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), - "iris": ("http://scitools.org.uk/iris/docs/latest/", None), - "numpy": ("https://docs.scipy.org/doc/numpy/", None), - "numba": ("https://numba.pydata.org/numba-doc/latest/", None), - "matplotlib": ("https://matplotlib.org/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), + "iris": ("https://scitools.org.uk/iris/docs/latest", None), + "numpy": ("https://docs.scipy.org/doc/numpy", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "numba": ("https://numba.pydata.org/numba-doc/latest", None), + "matplotlib": ("https://matplotlib.org", None), } diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 212e465b368..50c79eec935 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,10 @@ Breaking changes New Features ~~~~~~~~~~~~ + +- Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and + :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data + gap that will be filled by interpolation. By `Deepak Cherian `_. - :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a192fe08cee..5d0d6b483d9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2018,44 +2018,69 @@ def fillna(self, value: Any) -> "DataArray": def interpolate_na( self, - dim=None, + dim: Hashable = None, method: str = "linear", limit: int = None, use_coordinate: Union[bool, str] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "DataArray": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline', 'akima'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their - respective``scipy.interpolate`` classes. - use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. If use_coordinate is a string, it specifies the name of a + eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variariable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - DataArray + interpolated: DataArray + Filled in DataArray. See also -------- @@ -2070,6 +2095,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 15a7209ab24..5b337b70293 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3900,42 +3900,65 @@ def interpolate_na( method: str = "linear", limit: int = None, use_coordinate: Union[bool, Hashable] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "Dataset": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- - dim : Hashable + dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline': use their respective - ``scipy.interpolate`` classes. - use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. If use_coordinate is a string, it specifies the name of a + eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variariable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. - kwargs : any - parameters passed verbatim to the underlying interplation function + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - Dataset + interpolated: Dataset + Filled in Dataset. See also -------- @@ -3951,6 +3974,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) return new diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 77dde66484e..117fcaf8f81 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,18 +1,46 @@ import warnings from functools import partial -from typing import Any, Callable, Dict, Sequence +from numbers import Number +from typing import Any, Callable, Dict, Hashable, Sequence, Union import numpy as np import pandas as pd from . import utils -from .common import _contains_datetime_like_objects +from .common import _contains_datetime_like_objects, ones_like from .computation import apply_ufunc from .duck_array_ops import dask_array_type from .utils import OrderedSet, is_scalar from .variable import Variable, broadcast_variables +def _get_nan_block_lengths(obj, dim: Hashable, index: Variable): + """ + Return an object where each NaN element in 'obj' is replaced by the + length of the gap the element is in. + """ + + # make variable so that we get broadcasting for free + index = Variable([dim], index) + + # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 + arange = ones_like(obj) * index + valid = obj.notnull() + valid_arange = arange.where(valid) + cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) + + nan_block_lengths = ( + cumulative_nans.diff(dim=dim, label="upper") + .reindex({dim: obj[dim]}) + .where(valid) + .bfill(dim=dim) + .where(~valid, 0) + .fillna(index[-1] - valid_arange.max()) + ) + + return nan_block_lengths + + class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods """ @@ -178,7 +206,7 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds -def get_clean_interp_index(arr, dim, use_coordinate=True): +def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True): """get index to use for x values in interpolation. If use_coordinate is True, the coordinate that shares the name of the @@ -195,23 +223,33 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): index = arr.coords[use_coordinate] if index.ndim != 1: raise ValueError( - "Coordinates used for interpolation must be 1D, " - "%s is %dD." % (use_coordinate, index.ndim) + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {index.ndim}D." ) + index = index.to_index() + + # TODO: index.name is None for multiindexes + # set name for nice error messages below + if isinstance(index, pd.MultiIndex): + index.name = dim + + if not index.is_monotonic: + raise ValueError(f"Index {index.name!r} must be monotonically increasing") + + if not index.is_unique: + raise ValueError(f"Index {index.name!r} has duplicate values") # raise if index cannot be cast to a float (e.g. MultiIndex) try: index = index.values.astype(np.float64) except (TypeError, ValueError): # pandas raises a TypeError - # xarray/nuppy raise a ValueError + # xarray/numpy raise a ValueError raise TypeError( - "Index must be castable to float64 to support" - "interpolation, got: %s" % type(index) + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation, got {type(index).__name__}." ) - # check index sorting now so we can skip it later - if not (np.diff(index) > 0).all(): - raise ValueError("Index must be monotonicly increasing") + else: axis = arr.get_axis_num(dim) index = np.arange(arr.shape[axis], dtype=np.float64) @@ -220,7 +258,13 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): def interp_na( - self, dim=None, use_coordinate=True, method="linear", limit=None, **kwargs + self, + dim: Hashable = None, + use_coordinate: Union[bool, str] = True, + method: str = "linear", + limit: int = None, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, + **kwargs, ): """Interpolate values according to different methods. """ @@ -230,6 +274,40 @@ def interp_na( if limit is not None: valids = _get_valid_fill_mask(self, dim, limit) + if max_gap is not None: + max_type = type(max_gap).__name__ + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if ( + dim in self.indexes + and isinstance(self.indexes[dim], pd.DatetimeIndex) + and use_coordinate + ): + if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)): + raise TypeError( + f"Underlying index is DatetimeIndex. Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}" + ) + + if isinstance(max_gap, str): + try: + max_gap = pd.to_timedelta(max_gap) + except ValueError: + raise ValueError( + f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta" + ) + + if isinstance(max_gap, pd.Timedelta): + max_gap = np.timedelta64(max_gap.value, "ns") + + max_gap = np.timedelta64(max_gap, "ns").astype(np.float64) + + if not use_coordinate: + if not isinstance(max_gap, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." + ) + # method index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) @@ -253,6 +331,14 @@ def interp_na( if limit is not None: arr = arr.where(valids) + if max_gap is not None: + if dim not in self.coords: + raise NotImplementedError( + "max_gap not implemented for unlabeled coordinates yet." + ) + nan_block_lengths = _get_nan_block_lengths(self, dim, index) + arr = arr.where(nan_block_lengths <= max_gap) + return arr diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index cfce5d6f645..0b410383a34 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -5,7 +5,13 @@ import pytest import xarray as xr -from xarray.core.missing import NumpyInterpolator, ScipyInterpolator, SplineInterpolator +from xarray.core.missing import ( + NumpyInterpolator, + ScipyInterpolator, + SplineInterpolator, + get_clean_interp_index, + _get_nan_block_lengths, +) from xarray.core.pycompat import dask_array_type from xarray.tests import ( assert_array_equal, @@ -153,7 +159,7 @@ def test_interpolate_pd_compat_polynomial(): def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) expected = xr.DataArray(vals, dims="x", coords={"x": [2, 1, 3]}) - with raises_regex(ValueError, "Index must be monotonicly increasing"): + with raises_regex(ValueError, "Index 'x' must be monotonically increasing"): expected.interpolate_na(dim="x", method="index") @@ -169,12 +175,19 @@ def test_interpolate_invalid_interpolator_raises(): da.interpolate_na(dim="x", method="foo") +def test_interpolate_duplicate_values_raises(): + data = np.random.randn(2, 3) + da = xr.DataArray(data, coords=[("x", ["a", "a"]), ("y", [0, 1, 2])]) + with raises_regex(ValueError, "Index 'x' has duplicate values"): + da.interpolate_na(dim="x", method="foo") + + def test_interpolate_multiindex_raises(): data = np.random.randn(2, 3) data[1, 1] = np.nan da = xr.DataArray(data, coords=[("x", ["a", "b"]), ("y", [0, 1, 2])]) das = da.stack(z=("x", "y")) - with raises_regex(TypeError, "Index must be castable to float64"): + with raises_regex(TypeError, "Index 'z' must be castable to float64"): das.interpolate_na(dim="z") @@ -439,3 +452,114 @@ def test_ffill_dataset(ds): @requires_bottleneck def test_bfill_dataset(ds): ds.ffill(dim="time") + + +@requires_bottleneck +@pytest.mark.parametrize( + "y, lengths", + [ + [np.arange(9), [[3, 3, 3, 0, 3, 3, 0, 2, 2]]], + [np.arange(9) * 3, [[9, 9, 9, 0, 9, 9, 0, 6, 6]]], + [[0, 2, 5, 6, 7, 8, 10, 12, 14], [[6, 6, 6, 0, 4, 4, 0, 4, 4]]], + ], +) +def test_interpolate_na_nan_block_lengths(y, lengths): + arr = [[np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan]] + da = xr.DataArray(arr * 2, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected = da.copy(data=lengths * 2) + assert_equal(actual, expected) + + +@pytest.fixture +def da_time(): + return xr.DataArray( + [np.nan, 1, 2, np.nan, np.nan, 5, np.nan, np.nan, np.nan, np.nan, 10], + dims=["t"], + ) + + +def test_interpolate_na_max_gap_errors(da_time): + with raises_regex( + NotImplementedError, "max_gap not implemented for unlabeled coordinates" + ): + da_time.interpolate_na("t", max_gap=1) + + with raises_regex(ValueError, "max_gap must be a scalar."): + da_time.interpolate_na("t", max_gap=(1,)) + + da_time["t"] = pd.date_range("2001-01-01", freq="H", periods=11) + with raises_regex(TypeError, "Underlying index is"): + da_time.interpolate_na("t", max_gap=1) + + with raises_regex(TypeError, "Expected integer or floating point"): + da_time.interpolate_na("t", max_gap="1H", use_coordinate=False) + + with raises_regex(ValueError, "Could not convert 'huh' to timedelta64"): + da_time.interpolate_na("t", max_gap="huh") + + +@requires_bottleneck +@pytest.mark.parametrize( + "time_range_func", + [pd.date_range, pytest.param(xr.cftime_range, marks=pytest.mark.xfail)], +) +@pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.to_dataset(name="a")]) +@pytest.mark.parametrize( + "max_gap", ["3H", np.timedelta64(3, "h"), pd.to_timedelta("3H")] +) +def test_interpolate_na_max_gap_time_specifier( + da_time, max_gap, transform, time_range_func +): + da_time["t"] = time_range_func("2001-01-01", freq="H", periods=11) + expected = transform( + da_time.copy(data=[np.nan, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan, 10]) + ) + actual = transform(da_time).interpolate_na("t", max_gap=max_gap) + assert_equal(actual, expected) + + +@requires_bottleneck +@pytest.mark.parametrize( + "coords", + [ + pytest.param(None, marks=pytest.mark.xfail()), + {"x": np.arange(4), "y": np.arange(11)}, + ], +) +def test_interpolate_na_2d(coords): + da = xr.DataArray( + [ + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + ], + dims=["x", "y"], + coords=coords, + ) + + actual = da.interpolate_na("y", max_gap=2) + expected_y = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11], + ] + ) + assert_equal(actual, expected_y) + + actual = da.interpolate_na("x", max_gap=3) + expected_x = xr.DataArray( + [ + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + ], + dims=["x", "y"], + coords=coords, + ) + assert_equal(actual, expected_x)