Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CFTimeIndex in get_clean_interp_index #3631

Merged
merged 39 commits into from
Jan 26, 2020
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d5c2242
add support for CFTimeIndex in get_clean_interp_index
huard Dec 16, 2019
77bb24c
black
huard Dec 16, 2019
03e7769
added test comparing cftime index with standard index
huard Dec 16, 2019
e169cf4
added comment
huard Dec 16, 2019
303020f
index in ns instead of days
huard Dec 16, 2019
210fb94
pep8
huard Dec 16, 2019
1cfe72d
datetime_to_numeric: convert timedelta objects using np.timedelta64 t…
huard Dec 18, 2019
4964163
added interp test
huard Dec 18, 2019
83f6c89
switched clean_interp_index resolution to us. Fixed interpolate_na an…
huard Dec 18, 2019
6298953
Error message to explain overflow problem.
huard Dec 18, 2019
3d23ccf
Merge branch 'fix-3641' into cf_interp_index
huard Dec 19, 2019
2ba1803
switched timedelta64 units from ms to us
huard Dec 20, 2019
9a648d9
Merge branch 'fix-3641' into cf_interp_index
huard Dec 20, 2019
e873da2
reverted default user-visible resolution to ns. Converts to float, po…
huard Jan 6, 2020
532756d
pep8
huard Jan 6, 2020
73d8729
black
huard Jan 6, 2020
4288780
special case for older numpy versions
huard Jan 6, 2020
077145e
black
huard Jan 6, 2020
758d81c
added xfail for overflow error with numpy < 1.17
huard Jan 6, 2020
d0d8bfe
changes following PR comments from spencerclark
huard Jan 14, 2020
6c9630a
bypass pandas to convert timedeltas to floats. avoids overflow errors.
huard Jan 17, 2020
d18c775
black
huard Jan 17, 2020
78e17ec
Merge branch 'master' into cf_interp_index
huard Jan 17, 2020
6615c97
removed numpy conversion. added docstrings. renamed tests.
huard Jan 20, 2020
2df2b29
pep8
huard Jan 20, 2020
31f5417
updated whats new
huard Jan 20, 2020
2974af9
Update doc/whats-new.rst
huard Jan 20, 2020
eeb5074
update interpolate_na docstrings
huard Jan 20, 2020
6b9631f
black
huard Jan 20, 2020
5656fdb
dt conflicts with accessor
huard Jan 20, 2020
dcf98ff
replaced assert_equal by assert_allclose
huard Jan 24, 2020
4842a96
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
6dbf225
Update xarray/core/duck_array_ops.py
huard Jan 25, 2020
c90dc97
renamed array to value in timedelta_to_numeric. Added tests
huard Jan 25, 2020
71fb87d
removed support for TimedeltaIndex in timedelta_to_numeric
huard Jan 25, 2020
3d9f333
added tests for np_timedelta64_to_float and pd_timedelta_to_float. re…
huard Jan 26, 2020
b04785c
black
huard Jan 26, 2020
d24cae4
Fix flake8 error
spencerkclark Jan 26, 2020
6f0c504
black
spencerkclark Jan 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,14 @@ def __sub__(self, other):
import cftime

if isinstance(other, (CFTimeIndex, cftime.datetime)):
return pd.TimedeltaIndex(np.array(self) - np.array(other))
try:
return pd.TimedeltaIndex(np.array(self) - np.array(other))
except OverflowError:
raise ValueError(
huard marked this conversation as resolved.
Show resolved Hide resolved
"The time difference exceeds the range of values "
"that can be expressed at the nanosecond resolution."
)

elif isinstance(other, pd.TimedeltaIndex):
return CFTimeIndex(np.array(self) - other.to_pytimedelta())
else:
Expand Down
119 changes: 103 additions & 16 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import inspect
import warnings
from functools import partial
from distutils.version import LooseVersion

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -372,51 +373,137 @@ def _datetime_nanmin(array):


def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
    """Convert an array containing datetime-like data to numerical values.

    Convert the datetime array to a timedelta relative to an offset.

    Parameters
    ----------
    array : array-like
        Input data
    offset : None, datetime or cftime.datetime
        Datetime offset. If None, this is set by default to the array's minimum
        value to reduce round off errors.
    datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        If not None, convert output to a given datetime unit. Note that some
        conversions are not allowed due to non-linear relationships between units.
    dtype : dtype
        Output dtype.

    Returns
    -------
    array
        Numerical representation of datetime object relative to an offset.

    Notes
    -----
    Some datetime unit conversions won't work, for example from days to years, even
    though some calendars would allow for them (e.g. no_leap). This is because there
    is no `cftime.timedelta` object.
    """
    # TODO: make this function dask-compatible?
    # Set offset to minimum if not given
    if offset is None:
        if array.dtype.kind in "Mm":
            offset = _datetime_nanmin(array)
        else:
            offset = min(array)

    # Compute timedelta object.
    # For np.datetime64, this can silently yield garbage due to overflow.
    # One option is to enforce 1970-01-01 as the universal offset.
    array = array - offset

    # Scalar is converted to 0d-array
    if not hasattr(array, "dtype"):
        array = np.array(array)

    # Convert timedelta objects to float by first converting to microseconds.
    if array.dtype.kind in "O":
        # possibly convert object array containing datetime.timedelta
        array = np.asarray(pd.Series(array.ravel())).reshape(array.shape)
        return py_timedelta_to_float(array, datetime_unit or "ns").astype(dtype)

    # Convert np.NaT to np.nan
    elif array.dtype.kind in "mM":
        # Convert to specified timedelta units.
        if datetime_unit:
            array = array / np.timedelta64(1, datetime_unit)
        return np.where(isnull(array), np.nan, array.astype(dtype))

    return array.astype(dtype)


def timedelta_to_numeric(array, datetime_unit="ns", dtype=float):
    """Convert a timedelta-like object to numerical values.

    Parameters
    ----------
    array : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, \
pandas.TimedeltaIndex or str
        Timedelta-like value to convert. Strings are parsed with
        ``pandas.to_timedelta``.
    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        Unit of the output values.
    dtype : dtype
        Output dtype.

    Returns
    -------
    Numerical representation of the timedelta in units of ``datetime_unit``.

    Raises
    ------
    ValueError
        If a string cannot be parsed by ``pandas.to_timedelta``.
    TypeError
        If ``array`` is none of the supported types.
    """
    import datetime as dt

    # pd.Timedelta is a subclass of dt.timedelta, so it must be tested first;
    # otherwise the dt.timedelta branch shadows it and the dedicated exact
    # (nanosecond-based) conversion never runs.
    if isinstance(array, pd.Timedelta):
        out = pd_timedelta_to_float(array, datetime_unit)
    elif isinstance(array, dt.timedelta):
        out = py_timedelta_to_float(array, datetime_unit)
    elif isinstance(array, np.timedelta64):
        out = np_timedelta64_to_float(array, datetime_unit)
    elif isinstance(array, pd.TimedeltaIndex):
        out = pd_timedeltaindex_to_float(array, datetime_unit)
    elif isinstance(array, str):
        try:
            a = pd.to_timedelta(array)
        except ValueError:
            raise ValueError(
                f"Could not convert {array!r} to timedelta64 using pandas.to_timedelta"
            )
        # Fall through to the common astype(dtype) below so that string input
        # honours the requested output dtype like every other branch.
        out = py_timedelta_to_float(a, datetime_unit)
    else:
        raise TypeError(
            f"Expected array of type str, pandas.Timedelta, pandas.TimedeltaIndex, "
            f"datetime.timedelta or numpy.timedelta64, but received {type(array).__name__}"
        )
    return out.astype(dtype)


def _to_pytimedelta(array, unit="us"):
index = pd.TimedeltaIndex(array.ravel(), unit=unit)
return index.to_pytimedelta().reshape(array.shape)


def np_timedelta64_to_float(array, datetime_unit):
    """Convert numpy.timedelta64 values to float, possibly at a loss of resolution.

    The input is first cast to microsecond resolution, so sub-microsecond
    information is lost, but very long time spans stay representable.
    """
    # Cast to float *before* scaling: multiplying a timedelta64 array by a
    # fractional conversion factor would round the product back to integer
    # timedelta64 values and silently lose precision.
    array = array.astype("timedelta64[us]").astype(np.float64)
    conversion_factor = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return conversion_factor * array


def pd_timedelta_to_float(array, datetime_units):
    """Convert pandas.Timedelta to float.

    Built on the Timedelta's exact integer nanosecond value (``.value``), so
    no precision is lost before the conversion to float.
    """
    # Keep the value as a timedelta64 here. Casting to float at this point
    # would make np_timedelta64_to_float reinterpret the raw nanosecond count
    # in the wrong unit.
    array = np.timedelta64(array.value, "ns")
    return np_timedelta64_to_float(array, datetime_units)


def pd_timedeltaindex_to_float(array, datetime_units):
    """Convert pandas.TimedeltaIndex to float.

    The index's ``.values`` is an integer-nanosecond ``timedelta64`` array,
    so this delegates directly to the numpy conversion.
    """
    return np_timedelta64_to_float(array.values, datetime_units)


def py_timedelta_to_float(array, datetime_unit):
    """Convert a timedelta object to a float, possibly at a loss of resolution.

    Conversion goes through microsecond resolution, so sub-microsecond
    information is lost.

    Notes
    -----
    With NumPy >= 1.17, it's possible to convert directly from
    `datetime.timedelta` to `numpy.timedelta64` at the microsecond (us)
    resolution. This covers a fairly large span of years.

    With earlier NumPy versions, that direct conversion only works at the
    nanosecond resolution, which restricts the span that can be covered, so a
    pure-Python fallback based on ``timedelta.total_seconds`` is used instead.
    """
    # Parse the major/minor NumPy version without distutils.LooseVersion,
    # which was removed from the standard library in Python 3.12 (PEP 632).
    np_version = tuple(int(part) for part in np.__version__.split(".")[:2])
    if np_version < (1, 17):
        array = np.asarray(array)
        # total_seconds() keeps full timedelta precision; scale to microseconds.
        array = (
            np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6
        )
    else:
        array = np.asarray(array).astype("timedelta64[us]").astype(np.float64)
    conversion_factor = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return conversion_factor * array


def mean(array, axis=None, skipna=None, **kwargs):
"""inhouse mean that can handle np.datetime64 or cftime.datetime
dtypes"""
Expand Down
139 changes: 78 additions & 61 deletions xarray/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
from functools import partial
from numbers import Number
from typing import Any, Callable, Dict, Hashable, Sequence, Union
import datetime as dt

import numpy as np
import pandas as pd

from . import utils
from .common import _contains_datetime_like_objects, ones_like
from .computation import apply_ufunc
from .duck_array_ops import dask_array_type
from .duck_array_ops import dask_array_type, datetime_to_numeric, timedelta_to_numeric
from .utils import OrderedSet, is_scalar
from .variable import Variable, broadcast_variables

Expand Down Expand Up @@ -207,52 +208,81 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs):


def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True):
    """Return index to use for x values in interpolation or curve fitting.

    Parameters
    ----------
    arr : DataArray
        Array to interpolate or fit to a curve.
    dim : str
        Name of dimension along which to fit.
    use_coordinate : str or bool
        If use_coordinate is True, the coordinate that shares the name of the
        dimension along which interpolation is being performed will be used as the
        x values. If False, the x values are set as an equally spaced sequence.

    Returns
    -------
    Variable
        Numerical values for the x-coordinates.

    Notes
    -----
    If indexing is along the time dimension, datetime coordinates are converted
    to time deltas with respect to 1970-01-01.
    """
    # Question: If use_coordinate is a string, what role does `dim` play?
    from xarray.coding.cftimeindex import CFTimeIndex

    if use_coordinate is False:
        axis = arr.get_axis_num(dim)
        return np.arange(arr.shape[axis], dtype=np.float64)

    if use_coordinate is True:
        index = arr.get_index(dim)

    else:  # string
        index = arr.coords[use_coordinate]
        if index.ndim != 1:
            raise ValueError(
                f"Coordinates used for interpolation must be 1D, "
                f"{use_coordinate} is {index.ndim}D."
            )
        index = index.to_index()

    # TODO: index.name is None for multiindexes
    # set name for nice error messages below
    if isinstance(index, pd.MultiIndex):
        index.name = dim

    if not index.is_monotonic:
        raise ValueError(f"Index {index.name!r} must be monotonically increasing")

    if not index.is_unique:
        raise ValueError(f"Index {index.name!r} has duplicate values")

    # Special case for non-standard calendar indexes
    # Numerical datetime values are defined with respect to 1970-01-01T00:00:00 in units of nanoseconds
    if isinstance(index, (CFTimeIndex, pd.DatetimeIndex)):
        offset = type(index[0])(1970, 1, 1)
        if isinstance(index, CFTimeIndex):
            index = index.values
        index = Variable(
            data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"),
            dims=(dim,),
        )

    # raise if index cannot be cast to a float (e.g. MultiIndex)
    try:
        index = index.values.astype(np.float64)
    except (TypeError, ValueError):
        # pandas raises a TypeError
        # xarray/numpy raise a ValueError
        raise TypeError(
            f"Index {index.name!r} must be castable to float64 to support "
            f"interpolation, got {type(index).__name__}."
        )

    return index

Expand All @@ -263,11 +293,13 @@ def interp_na(
use_coordinate: Union[bool, str] = True,
method: str = "linear",
limit: int = None,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64, dt.timedelta] = None,
**kwargs,
):
"""Interpolate values according to different methods.
"""
from xarray.coding.cftimeindex import CFTimeIndex

if dim is None:
raise NotImplementedError("dim is a required argument")

Expand All @@ -281,26 +313,11 @@ def interp_na(

if (
dim in self.indexes
and isinstance(self.indexes[dim], pd.DatetimeIndex)
and isinstance(self.indexes[dim], (pd.DatetimeIndex, CFTimeIndex))
and use_coordinate
):
if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)):
raise TypeError(
f"Underlying index is DatetimeIndex. Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}"
)

if isinstance(max_gap, str):
try:
max_gap = pd.to_timedelta(max_gap)
except ValueError:
raise ValueError(
f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta"
)

if isinstance(max_gap, pd.Timedelta):
max_gap = np.timedelta64(max_gap.value, "ns")

max_gap = np.timedelta64(max_gap, "ns").astype(np.float64)
# Convert to float
max_gap = timedelta_to_numeric(max_gap)
huard marked this conversation as resolved.
Show resolved Hide resolved

if not use_coordinate:
if not isinstance(max_gap, (Number, np.number)):
Expand Down
Loading