Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CFTimeIndex in get_clean_interp_index #3631

Merged
merged 39 commits into from
Jan 26, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d5c2242
add support for CFTimeIndex in get_clean_interp_index
huard Dec 16, 2019
77bb24c
black
huard Dec 16, 2019
03e7769
added test comparing cftime index with standard index
huard Dec 16, 2019
e169cf4
added comment
huard Dec 16, 2019
303020f
index in ns instead of days
huard Dec 16, 2019
210fb94
pep8
huard Dec 16, 2019
1cfe72d
datetime_to_numeric: convert timedelta objects using np.timedelta64 t…
huard Dec 18, 2019
4964163
added interp test
huard Dec 18, 2019
83f6c89
switched clean_interp_index resolution to us. Fixed interpolate_na an…
huard Dec 18, 2019
6298953
Error message to explain overflow problem.
huard Dec 18, 2019
3d23ccf
Merge branch 'fix-3641' into cf_interp_index
huard Dec 19, 2019
2ba1803
switched timedelta64 units from ms to us
huard Dec 20, 2019
9a648d9
Merge branch 'fix-3641' into cf_interp_index
huard Dec 20, 2019
e873da2
reverted default user-visible resolution to ns. Converts to float, po…
huard Jan 6, 2020
532756d
pep8
huard Jan 6, 2020
73d8729
black
huard Jan 6, 2020
4288780
special case for older numpy versions
huard Jan 6, 2020
077145e
black
huard Jan 6, 2020
758d81c
added xfail for overflow error with numpy < 1.17
huard Jan 6, 2020
d0d8bfe
changes following PR comments from spencerclark
huard Jan 14, 2020
6c9630a
bypass pandas to convert timedeltas to floats. avoids overflow errors.
huard Jan 17, 2020
d18c775
black
huard Jan 17, 2020
78e17ec
Merge branch 'master' into cf_interp_index
huard Jan 17, 2020
6615c97
removed numpy conversion. added docstrings. renamed tests.
huard Jan 20, 2020
2df2b29
pep8
huard Jan 20, 2020
31f5417
updated whats new
huard Jan 20, 2020
2974af9
Update doc/whats-new.rst
huard Jan 20, 2020
eeb5074
update interpolate_na docstrings
huard Jan 20, 2020
6b9631f
black
huard Jan 20, 2020
5656fdb
dt conflicts with accessor
huard Jan 20, 2020
dcf98ff
replaced assert_equal by assert_allclose
huard Jan 24, 2020
4842a96
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
6dbf225
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
c90dc97
renamed array to value in timedelta_to_numeric. Added tests
huard Jan 25, 2020
71fb87d
removed support for TimedeltaIndex in timedelta_to_numeric
huard Jan 25, 2020
3d9f333
added tests for np_timedelta64_to_float and pd_timedelta_to_float. re…
huard Jan 26, 2020
b04785c
black
huard Jan 26, 2020
d24cae4
Fix flake8 error
spencerkclark Jan 26, 2020
6f0c504
black
spencerkclark Jan 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 69 additions & 42 deletions xarray/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from . import utils
from .common import _contains_datetime_like_objects, ones_like
from .computation import apply_ufunc
from .duck_array_ops import dask_array_type
from .duck_array_ops import dask_array_type, datetime_to_numeric
from .utils import OrderedSet, is_scalar
from .variable import Variable, broadcast_variables

Expand Down Expand Up @@ -207,52 +207,79 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs):


def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True):
"""get index to use for x values in interpolation.
"""Return index to use for x values in interpolation or curve fitting.

If use_coordinate is True, the coordinate that shares the name of the
dimension along which interpolation is being performed will be used as the
x values.
Parameters
----------
arr : DataArray
Array to interpolate or fit to a curve.
dim : str
Name of dimension along which to fit.
use_coordinate : str or bool
If use_coordinate is True, the coordinate that shares the name of the
dimension along which interpolation is being performed will be used as the
x values. If False, the x values are set as an equally spaced sequence.

Returns
-------
Variable
Numerical values for the x-coordinates.

If use_coordinate is False, the x values are set as an equally spaced
sequence.
Notes
-----
If indexing is along the time dimension, datetime coordinates are converted
to time deltas with respect to 1970-01-01.
"""
if use_coordinate:
if use_coordinate is True:
index = arr.get_index(dim)
else:
index = arr.coords[use_coordinate]
if index.ndim != 1:
raise ValueError(
f"Coordinates used for interpolation must be 1D, "
f"{use_coordinate} is {index.ndim}D."
)
index = index.to_index()

# TODO: index.name is None for multiindexes
# set name for nice error messages below
if isinstance(index, pd.MultiIndex):
index.name = dim

if not index.is_monotonic:
raise ValueError(f"Index {index.name!r} must be monotonically increasing")

if not index.is_unique:
raise ValueError(f"Index {index.name!r} has duplicate values")

# raise if index cannot be cast to a float (e.g. MultiIndex)
try:
index = index.values.astype(np.float64)
except (TypeError, ValueError):
# pandas raises a TypeError
# xarray/numpy raise a ValueError
raise TypeError(
f"Index {index.name!r} must be castable to float64 to support "
f"interpolation, got {type(index).__name__}."
)

else:
# Question: If use_coordinate is a string, what role does `dim` play?
from xarray.coding.cftimeindex import CFTimeIndex

if use_coordinate is False:
axis = arr.get_axis_num(dim)
index = np.arange(arr.shape[axis], dtype=np.float64)
return np.arange(arr.shape[axis], dtype=np.float64)

if use_coordinate is True:
index = arr.get_index(dim)

else: # string
index = arr.coords[use_coordinate]
if index.ndim != 1:
raise ValueError(
f"Coordinates used for interpolation must be 1D, "
f"{use_coordinate} is {index.ndim}D."
)
index = index.to_index()

# TODO: index.name is None for multiindexes
# set name for nice error messages below
if isinstance(index, pd.MultiIndex):
index.name = dim

if not index.is_monotonic:
raise ValueError(f"Index {index.name!r} must be monotonically increasing")

if not index.is_unique:
raise ValueError(f"Index {index.name!r} has duplicate values")

# Special case for non-standard calendar indexes
# Numerical datetime values are defined with respect to 1970-01-01T00:00:00 in units of nanoseconds
if isinstance(index, CFTimeIndex):
offset = type(index[0])(1970, 1, 1)
spencerkclark marked this conversation as resolved.
Show resolved Hide resolved
index = Variable(
data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"),
spencerkclark marked this conversation as resolved.
Show resolved Hide resolved
dims=(dim,),
)

# raise if index cannot be cast to a float (e.g. MultiIndex)
try:
index = index.values.astype(np.float64)
except (TypeError, ValueError):
# pandas raises a TypeError
# xarray/numpy raise a ValueError
raise TypeError(
f"Index {index.name!r} must be castable to float64 to support "
f"interpolation, got {type(index).__name__}."
)

return index

Expand Down
52 changes: 52 additions & 0 deletions xarray/tests/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,29 @@
requires_bottleneck,
requires_dask,
requires_scipy,
requires_cftime,
)

from xarray.tests.test_cftime_offsets import _CFTIME_CALENDARS


@pytest.fixture
def da():
return xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time")


@pytest.fixture
def cf_da():
def _cf_da(calendar, freq="1D"):
times = xr.cftime_range(
start="1970-01-01", freq=freq, periods=10, calendar=calendar
)
values = np.arange(10)
return xr.DataArray(values, dims=("time",), coords={"time": times})

return _cf_da


@pytest.fixture
def ds():
ds = xr.Dataset()
Expand Down Expand Up @@ -472,6 +487,43 @@ def test_interpolate_na_nan_block_lengths(y, lengths):
assert_equal(actual, expected)


@requires_cftime
@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS)
def test_get_clean_interp_index_calendar(cf_da, calendar):
"""The index for CFTimeIndex is in units of days. This means that if two series using a 360 and 365 days
calendar each have a trend of .01C/year, the linear regression coefficients will be different because they
have different number of days.

Another option would be to have an index in units of years, but this would likely create other difficulties.
"""
i = get_clean_interp_index(cf_da(calendar), dim="time")
np.testing.assert_array_equal(i, np.arange(10) * 1e9 * 86400)


@requires_cftime
@pytest.mark.parametrize(
("calendar", "freq"), zip(["gregorian", "proleptic_gregorian"], ["1D", "1M", "1Y"])
)
def test_get_clean_interp_index_dt(cf_da, calendar, freq):
"""In the gregorian case, the index should be proportional to normal datetimes."""
g = cf_da(calendar, freq=freq)
g["stime"] = xr.Variable(data=g.time.to_index().to_datetimeindex(), dims=("time",))

gi = get_clean_interp_index(g, "time")
si = get_clean_interp_index(g, "time", use_coordinate="stime")
np.testing.assert_array_equal(gi, si)


@pytest.mark.xfail
def test_get_clean_interp_index_overflow():
spencerkclark marked this conversation as resolved.
Show resolved Hide resolved
da = xr.DataArray(
[0, 1, 2],
dims=("time",),
coords={"time": xr.cftime_range("0000-01-01", periods=3, calendar="360_day")},
)
get_clean_interp_index(da, "time")


@pytest.fixture
def da_time():
return xr.DataArray(
Expand Down