Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CFTimeIndex in get_clean_interp_index #3631

Merged
merged 39 commits into from
Jan 26, 2020
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d5c2242
add support for CFTimeIndex in get_clean_interp_index
huard Dec 16, 2019
77bb24c
black
huard Dec 16, 2019
03e7769
added test comparing cftime index with standard index
huard Dec 16, 2019
e169cf4
added comment
huard Dec 16, 2019
303020f
index in ns instead of days
huard Dec 16, 2019
210fb94
pep8
huard Dec 16, 2019
1cfe72d
datetime_to_numeric: convert timedelta objects using np.timedelta64 t…
huard Dec 18, 2019
4964163
added interp test
huard Dec 18, 2019
83f6c89
switched clean_interp_index resolution to us. Fixed interpolate_na an…
huard Dec 18, 2019
6298953
Error message to explain overflow problem.
huard Dec 18, 2019
3d23ccf
Merge branch 'fix-3641' into cf_interp_index
huard Dec 19, 2019
2ba1803
switched timedelta64 units from ms to us
huard Dec 20, 2019
9a648d9
Merge branch 'fix-3641' into cf_interp_index
huard Dec 20, 2019
e873da2
reverted default user-visible resolution to ns. Converts to float, po…
huard Jan 6, 2020
532756d
pep8
huard Jan 6, 2020
73d8729
black
huard Jan 6, 2020
4288780
special case for older numpy versions
huard Jan 6, 2020
077145e
black
huard Jan 6, 2020
758d81c
added xfail for overflow error with numpy < 1.17
huard Jan 6, 2020
d0d8bfe
changes following PR comments from spencerclark
huard Jan 14, 2020
6c9630a
bypass pandas to convert timedeltas to floats. avoids overflow errors.
huard Jan 17, 2020
d18c775
black
huard Jan 17, 2020
78e17ec
Merge branch 'master' into cf_interp_index
huard Jan 17, 2020
6615c97
removed numpy conversion. added docstrings. renamed tests.
huard Jan 20, 2020
2df2b29
pep8
huard Jan 20, 2020
31f5417
updated whats new
huard Jan 20, 2020
2974af9
Update doc/whats-new.rst
huard Jan 20, 2020
eeb5074
update interpolate_na docstrings
huard Jan 20, 2020
6b9631f
black
huard Jan 20, 2020
5656fdb
dt conflicts with accessor
huard Jan 20, 2020
dcf98ff
replaced assert_equal by assert_allclose
huard Jan 24, 2020
4842a96
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
6dbf225
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
c90dc97
renamed array to value in timedelta_to_numeric. Added tests
huard Jan 25, 2020
71fb87d
removed support for TimedeltaIndex in timedelta_to_numeric
huard Jan 25, 2020
3d9f333
added tests for np_timedelta64_to_float and pd_timedelta_to_float. re…
huard Jan 26, 2020
b04785c
black
huard Jan 26, 2020
d24cae4
Fix flake8 error
spencerkclark Jan 26, 2020
6f0c504
black
spencerkclark Jan 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,14 @@ def __sub__(self, other):
import cftime

if isinstance(other, (CFTimeIndex, cftime.datetime)):
return pd.TimedeltaIndex(np.array(self) - np.array(other))
try:
return pd.TimedeltaIndex(np.array(self) - np.array(other))
except OverflowError:
raise ValueError(
huard marked this conversation as resolved.
Show resolved Hide resolved
"The time difference exceeds the range of values "
"that can be expressed at the nanosecond resolution."
)

elif isinstance(other, pd.TimedeltaIndex):
return CFTimeIndex(np.array(self) - other.to_pytimedelta())
else:
Expand Down
119 changes: 103 additions & 16 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import inspect
import warnings
from functools import partial
from distutils.version import LooseVersion

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -372,51 +373,137 @@ def _datetime_nanmin(array):


def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
    """Convert an array containing datetime-like data to numerical values.

    Convert the datetime array to a timedelta relative to an offset.

    Parameters
    ----------
    array : array-like
        Input data
    offset : None, datetime or cftime.datetime
        Datetime offset. If None, this is set by default to the array's minimum
        value to reduce round off errors.
    datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        If not None, convert output to a given datetime unit. Note that some
        conversions are not allowed due to non-linear relationships between units.
    dtype : dtype
        Output dtype.

    Returns
    -------
    array
        Numerical representation of datetime object relative to an offset.

    Notes
    -----
    Some datetime unit conversions won't work, for example from days to years, even
    though some calendars would allow for them (e.g. no_leap). This is because there
    is no `cftime.timedelta` object.
    """
    # TODO: make this function dask-compatible?
    # Set offset to minimum if not given
    if offset is None:
        if array.dtype.kind in "Mm":
            offset = _datetime_nanmin(array)
        else:
            offset = min(array)

    # Compute timedelta object.
    # For np.datetime64, this can silently yield garbage due to overflow.
    # One option is to enforce 1970-01-01 as the universal offset.
    array = array - offset

    # Scalar is converted to 0d-array
    if not hasattr(array, "dtype"):
        array = np.array(array)

    # Convert timedelta objects to float by first converting to microseconds.
    if array.dtype.kind in "O":
        # possibly convert object array containing datetime.timedelta
        array = np.asarray(pd.Series(array.ravel())).reshape(array.shape)
        return py_timedelta_to_float(array, datetime_unit or "ns").astype(dtype)

    # Convert np.NaT to np.nan
    elif array.dtype.kind in "mM":
        # Convert to specified timedelta units.
        if datetime_unit:
            array = array / np.timedelta64(1, datetime_unit)
        return np.where(isnull(array), np.nan, array.astype(dtype))

    return array.astype(dtype)


def timedelta_to_numeric(array, datetime_unit="ns", dtype=float):
    """Convert a timedelta-like object to numerical values.

    Parameters
    ----------
    array : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, \
pandas.TimedeltaIndex or str
        Timedelta-like value to convert. Strings are parsed with
        ``pandas.to_timedelta``.
    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        Unit of the output values.
    dtype : dtype
        Output dtype.

    Returns
    -------
    Numerical representation of the timedelta in units of ``datetime_unit``.

    Raises
    ------
    ValueError
        If a string cannot be parsed by ``pandas.to_timedelta``.
    TypeError
        If ``array`` is none of the supported types.
    """
    import datetime as dt

    # pd.Timedelta is a subclass of dt.timedelta, so it must be tested first;
    # otherwise the dt.timedelta branch shadows it and the dedicated exact
    # (nanosecond-based) conversion never runs.
    if isinstance(array, pd.Timedelta):
        out = pd_timedelta_to_float(array, datetime_unit)
    elif isinstance(array, dt.timedelta):
        out = py_timedelta_to_float(array, datetime_unit)
    elif isinstance(array, np.timedelta64):
        out = np_timedelta64_to_float(array, datetime_unit)
    elif isinstance(array, pd.TimedeltaIndex):
        out = pd_timedeltaindex_to_float(array, datetime_unit)
    elif isinstance(array, str):
        try:
            a = pd.to_timedelta(array)
        except ValueError:
            raise ValueError(
                f"Could not convert {array!r} to timedelta64 using pandas.to_timedelta"
            )
        # Fall through to the common astype(dtype) below so that string input
        # honours the requested output dtype like every other branch.
        out = py_timedelta_to_float(a, datetime_unit)
    else:
        raise TypeError(
            f"Expected array of type str, pandas.Timedelta, pandas.TimedeltaIndex, "
            f"datetime.timedelta or numpy.timedelta64, but received {type(array).__name__}"
        )
    return out.astype(dtype)


def _to_pytimedelta(array, unit="us"):
index = pd.TimedeltaIndex(array.ravel(), unit=unit)
return index.to_pytimedelta().reshape(array.shape)


def np_timedelta64_to_float(array, datetime_unit):
    """Convert numpy.timedelta64 values to float, possibly at a loss of resolution.

    The input is first cast to microsecond resolution, so sub-microsecond
    information is lost, but very long time spans stay representable.
    """
    # Cast to float *before* scaling: multiplying a timedelta64 array by a
    # fractional conversion factor would round the product back to integer
    # timedelta64 values and silently lose precision.
    array = array.astype("timedelta64[us]").astype(np.float64)
    conversion_factor = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return conversion_factor * array


def pd_timedelta_to_float(array, datetime_units):
    """Convert pandas.Timedelta to float.

    Built on the Timedelta's exact integer nanosecond value (``.value``), so
    no precision is lost before the conversion to float.
    """
    # Keep the value as a timedelta64 here. Casting to float at this point
    # would make np_timedelta64_to_float reinterpret the raw nanosecond count
    # in the wrong unit.
    array = np.timedelta64(array.value, "ns")
    return np_timedelta64_to_float(array, datetime_units)


def pd_timedeltaindex_to_float(array, datetime_units):
    """Convert pandas.TimedeltaIndex to float.

    The index's ``.values`` is an integer-nanosecond ``timedelta64`` array,
    so this delegates directly to the numpy conversion.
    """
    return np_timedelta64_to_float(array.values, datetime_units)


def py_timedelta_to_float(array, datetime_unit):
    """Convert a timedelta object to a float, possibly at a loss of resolution.

    Conversion goes through microsecond resolution, so sub-microsecond
    information is lost.

    Notes
    -----
    With NumPy >= 1.17, it's possible to convert directly from
    `datetime.timedelta` to `numpy.timedelta64` at the microsecond (us)
    resolution. This covers a fairly large span of years.

    With earlier NumPy versions, that direct conversion only works at the
    nanosecond resolution, which restricts the span that can be covered, so a
    pure-Python fallback based on ``timedelta.total_seconds`` is used instead.
    """
    # Parse the major/minor NumPy version without distutils.LooseVersion,
    # which was removed from the standard library in Python 3.12 (PEP 632).
    np_version = tuple(int(part) for part in np.__version__.split(".")[:2])
    if np_version < (1, 17):
        array = np.asarray(array)
        # total_seconds() keeps full timedelta precision; scale to microseconds.
        array = (
            np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6
        )
    else:
        array = np.asarray(array).astype("timedelta64[us]").astype(np.float64)
    conversion_factor = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return conversion_factor * array


def mean(array, axis=None, skipna=None, **kwargs):
"""inhouse mean that can handle np.datetime64 or cftime.datetime
dtypes"""
Expand Down
139 changes: 78 additions & 61 deletions xarray/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
from functools import partial
from numbers import Number
from typing import Any, Callable, Dict, Hashable, Sequence, Union
import datetime as dt

import numpy as np
import pandas as pd

from . import utils
from .common import _contains_datetime_like_objects, ones_like
from .computation import apply_ufunc
from .duck_array_ops import dask_array_type
from .duck_array_ops import dask_array_type, datetime_to_numeric, timedelta_to_numeric
from .utils import OrderedSet, is_scalar
from .variable import Variable, broadcast_variables

Expand Down Expand Up @@ -207,52 +208,81 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs):


def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True):
    """Return index to use for x values in interpolation or curve fitting.

    Parameters
    ----------
    arr : DataArray
        Array to interpolate or fit to a curve.
    dim : str
        Name of dimension along which to fit.
    use_coordinate : str or bool
        If use_coordinate is True, the coordinate that shares the name of the
        dimension along which interpolation is being performed will be used as the
        x values. If False, the x values are set as an equally spaced sequence.

    Returns
    -------
    Variable
        Numerical values for the x-coordinates.

    Notes
    -----
    If indexing is along the time dimension, datetime coordinates are converted
    to time deltas with respect to 1970-01-01.
    """
    # Question: If use_coordinate is a string, what role does `dim` play?
    from xarray.coding.cftimeindex import CFTimeIndex

    if use_coordinate is False:
        axis = arr.get_axis_num(dim)
        return np.arange(arr.shape[axis], dtype=np.float64)

    if use_coordinate is True:
        index = arr.get_index(dim)

    else:  # string
        index = arr.coords[use_coordinate]
        if index.ndim != 1:
            raise ValueError(
                f"Coordinates used for interpolation must be 1D, "
                f"{use_coordinate} is {index.ndim}D."
            )
        index = index.to_index()

    # TODO: index.name is None for multiindexes
    # set name for nice error messages below
    if isinstance(index, pd.MultiIndex):
        index.name = dim

    if not index.is_monotonic:
        raise ValueError(f"Index {index.name!r} must be monotonically increasing")

    if not index.is_unique:
        raise ValueError(f"Index {index.name!r} has duplicate values")

    # Special case for non-standard calendar indexes
    # Numerical datetime values are defined with respect to 1970-01-01T00:00:00 in units of nanoseconds
    if isinstance(index, (CFTimeIndex, pd.DatetimeIndex)):
        offset = type(index[0])(1970, 1, 1)
        if isinstance(index, CFTimeIndex):
            index = index.values
        index = Variable(
            data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"),
            dims=(dim,),
        )

    # raise if index cannot be cast to a float (e.g. MultiIndex)
    try:
        index = index.values.astype(np.float64)
    except (TypeError, ValueError):
        # pandas raises a TypeError
        # xarray/numpy raise a ValueError
        raise TypeError(
            f"Index {index.name!r} must be castable to float64 to support "
            f"interpolation, got {type(index).__name__}."
        )

    return index

Expand All @@ -263,11 +293,13 @@ def interp_na(
use_coordinate: Union[bool, str] = True,
method: str = "linear",
limit: int = None,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64, dt.timedelta] = None,
**kwargs,
):
"""Interpolate values according to different methods.
"""
from xarray.coding.cftimeindex import CFTimeIndex

if dim is None:
raise NotImplementedError("dim is a required argument")

Expand All @@ -281,26 +313,11 @@ def interp_na(

if (
dim in self.indexes
and isinstance(self.indexes[dim], pd.DatetimeIndex)
and isinstance(self.indexes[dim], (pd.DatetimeIndex, CFTimeIndex))
and use_coordinate
):
if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)):
raise TypeError(
f"Underlying index is DatetimeIndex. Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}"
)

if isinstance(max_gap, str):
try:
max_gap = pd.to_timedelta(max_gap)
except ValueError:
raise ValueError(
f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta"
)

if isinstance(max_gap, pd.Timedelta):
max_gap = np.timedelta64(max_gap.value, "ns")

max_gap = np.timedelta64(max_gap, "ns").astype(np.float64)
# Convert to float
max_gap = timedelta_to_numeric(max_gap)
huard marked this conversation as resolved.
Show resolved Hide resolved

if not use_coordinate:
if not isinstance(max_gap, (Number, np.number)):
Expand Down
Loading