diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index f19908b02b8..2bdbc74b4b7 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -73,6 +73,8 @@ Bug fixes
   with size one in some dimension can now be plotted, which is good for
   exploring satellite imagery (:issue:`1780`).
   By `Zac Hatfield-Dodds `_.
+- Fixed ``UnboundLocalError`` when opening netCDF file (:issue:`1781`).
+  By `Stephan Hoyer `_.
 - The ``variables``, ``attrs``, and ``dimensions`` properties have been
   deprecated as part of a bug fix addressing an issue where backends were
   unintentionally loading the datastore's data and attributes repeatedly during
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 30ea51811c4..b4645aa8071 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 
+from .. import coding
 from .. import Variable
 from ..core import indexing
 from ..core.utils import FrozenOrderedDict, HiddenKeyDict
@@ -259,13 +260,13 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
         raise NotImplementedError("Variable `%s` is an object. Zarr "
                                   "store can't yet encode objects." % name)
 
-    var = conventions.maybe_encode_datetime(var, name=name)
-    var = conventions.maybe_encode_timedelta(var, name=name)
-    var, needs_copy = conventions.maybe_encode_offset_and_scale(var,
-                                                                needs_copy,
-                                                                name=name)
-    var, needs_copy = conventions.maybe_encode_fill_value(var, needs_copy,
-                                                          name=name)
+    for coder in [coding.times.CFDatetimeCoder(),
+                  coding.times.CFTimedeltaCoder(),
+                  coding.variables.CFScaleOffsetCoder(),
+                  coding.variables.CFMaskCoder(),
+                  coding.variables.UnsignedIntegerCoder()]:
+        var = coder.encode(var, name=name)
+
     var = conventions.maybe_encode_nonstring_dtype(var, name=name)
     var = conventions.maybe_default_fill_value(var)
     var = conventions.maybe_encode_bools(var)
diff --git a/xarray/coding/times.py b/xarray/coding/times.py
new file mode 100644
index 00000000000..e00769af884
--- /dev/null
+++ b/xarray/coding/times.py
@@ -0,0 +1,363 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import traceback
+import warnings
+from datetime import datetime
+from functools import partial
+
+import numpy as np
+
+import pandas as pd
+try:
+    from pandas.errors import OutOfBoundsDatetime
+except ImportError:
+    # pandas < 0.20
+    from pandas.tslib import OutOfBoundsDatetime
+
+from .variables import (SerializationWarning, VariableCoder,
+                        lazy_elemwise_func, pop_to, safe_setitem,
+                        unpack_for_decoding, unpack_for_encoding)
+from ..core import indexing
+from ..core.formatting import first_n_items, format_timestamp, last_item
+from ..core.pycompat import PY3
+from ..core.variable import Variable
+
+
+# standard calendars recognized by netcdftime
+_STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian'])
+
+_NS_PER_TIME_DELTA = {'us': int(1e3),
+                      'ms': int(1e6),
+                      's': int(1e9),
+                      'm': int(1e9) * 60,
+                      'h': int(1e9) * 60 * 60,
+                      'D': int(1e9) * 60 * 60 * 24}
+
+TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds',
+                        'milliseconds', 'microseconds'])
+
+
+def _netcdf_to_numpy_timeunit(units):
+    units = units.lower()
+    if not units.endswith('s'):
+        units = '%ss' % units
+    return {'microseconds': 'us', 'milliseconds': 'ms', 'seconds': 's',
+            'minutes': 'm', 'hours': 'h', 'days': 'D'}[units]
+
+
+def _unpack_netcdf_time_units(units):
+    # CF datetime units follow the format: "UNIT since DATE"
+    # this parses out the unit and date allowing for extraneous
+    # whitespace.
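+    # e.g. "days since 2000-01-01" -> ("days", "2000-01-01")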
+ matches = re.match('(.+) since (.+)', units) + if not matches: + raise ValueError('invalid time units: %s' % units) + delta_units, ref_date = [s.strip() for s in matches.groups()] + return delta_units, ref_date + + +def _decode_datetime_with_netcdf4(num_dates, units, calendar): + import netCDF4 as nc4 + + dates = np.asarray(nc4.num2date(num_dates, units, calendar)) + if (dates[np.nanargmin(num_dates)].year < 1678 or + dates[np.nanargmax(num_dates)].year >= 2262): + warnings.warn('Unable to decode time axis into full ' + 'numpy.datetime64 objects, continuing using dummy ' + 'netCDF4.datetime objects instead, reason: dates out' + ' of range', SerializationWarning, stacklevel=3) + else: + try: + dates = nctime_to_nptime(dates) + except ValueError as e: + warnings.warn('Unable to decode time axis into full ' + 'numpy.datetime64 objects, continuing using ' + 'dummy netCDF4.datetime objects instead, reason:' + '{0}'.format(e), SerializationWarning, stacklevel=3) + return dates + + +def _decode_cf_datetime_dtype(data, units, calendar): + # Verify that at least the first and last date can be decoded + # successfully. Otherwise, tracebacks end up swallowed by + # Dataset.__repr__ when users try to view their lazily decoded array. + values = indexing.ImplicitToExplicitIndexingAdapter( + indexing.as_indexable(data)) + example_value = np.concatenate([first_n_items(values, 1) or [0], + last_item(values) or [0]]) + + try: + result = decode_cf_datetime(example_value, units, calendar) + except Exception: + calendar_msg = ('the default calendar' if calendar is None + else 'calendar %r' % calendar) + msg = ('unable to decode time units %r with %s. Try ' + 'opening your dataset with decode_times=False.' + % (units, calendar_msg)) + if not PY3: + msg += ' Full traceback:\n' + traceback.format_exc() + raise ValueError(msg) + else: + dtype = getattr(result, 'dtype', np.dtype('object')) + + return dtype + + +def decode_cf_datetime(num_dates, units, calendar=None): + """Given an array of numeric dates in netCDF format, convert it into a + numpy array of date time objects. + + For standard (Gregorian) calendars, this function uses vectorized + operations, which makes it much faster than netCDF4.num2date. In such a + case, the returned array will be of type np.datetime64. + + Note that time unit in `units` must not be smaller than microseconds and + not larger than days. 
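+
+    For example (illustrative only; the exact repr depends on the NumPy
+    version):
+
+    >>> decode_cf_datetime(np.array([0, 1]), 'days since 2000-01-01')
+    array(['2000-01-01T00:00:00.000000000', '2000-01-02T00:00:00.000000000'],
+          dtype='datetime64[ns]')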
+
+    See also
+    --------
+    netCDF4.num2date
+    """
+    num_dates = np.asarray(num_dates)
+    flat_num_dates = num_dates.ravel()
+    if calendar is None:
+        calendar = 'standard'
+
+    delta, ref_date = _unpack_netcdf_time_units(units)
+
+    try:
+        if calendar not in _STANDARD_CALENDARS:
+            raise OutOfBoundsDatetime
+
+        delta = _netcdf_to_numpy_timeunit(delta)
+        try:
+            ref_date = pd.Timestamp(ref_date)
+        except ValueError:
+            # ValueError is raised by pd.Timestamp for non-ISO timestamp
+            # strings, in which case we fall back to using netCDF4
+            raise OutOfBoundsDatetime
+
+        # fixes: https://github.com/pydata/pandas/issues/14068
+        # these lines check if the lowest or the highest value in dates
+        # causes an OutOfBoundsDatetime (Overflow) error
+        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
+        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
+
+        # Cast input dates to integers of nanoseconds because `pd.to_datetime`
+        # works much faster when dealing with integers
+        flat_num_dates_ns_int = (flat_num_dates *
+                                 _NS_PER_TIME_DELTA[delta]).astype(np.int64)
+
+        dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
+                 ref_date).values
+
+    except (OutOfBoundsDatetime, OverflowError):
+        dates = _decode_datetime_with_netcdf4(flat_num_dates.astype(np.float),
+                                              units,
+                                              calendar)
+
+    return dates.reshape(num_dates.shape)
+
+
+def decode_cf_timedelta(num_timedeltas, units):
+    """Given an array of numeric timedeltas in netCDF format, convert it into a
+    numpy timedelta64[ns] array.
+    """
+    num_timedeltas = np.asarray(num_timedeltas)
+    units = _netcdf_to_numpy_timeunit(units)
+
+    shape = num_timedeltas.shape
+    num_timedeltas = num_timedeltas.ravel()
+
+    result = pd.to_timedelta(num_timedeltas, unit=units, box=False)
+    # NaT is returned unboxed with wrong units; this should be fixed in pandas
+    if result.dtype != 'timedelta64[ns]':
+        result = result.astype('timedelta64[ns]')
+    return result.reshape(shape)
+
+
+def _infer_time_units_from_diff(unique_timedeltas):
+    for time_unit in ['days', 'hours', 'minutes', 'seconds']:
+        delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)]
+        unit_delta = np.timedelta64(delta_ns, 'ns')
+        diffs = unique_timedeltas / unit_delta
+        if np.all(diffs == diffs.astype(int)):
+            return time_unit
+    return 'seconds'
+
+
+def infer_datetime_units(dates):
+    """Given an array of datetimes, returns a CF compatible time-unit string of
+    the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
+    'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
+    unique time deltas in `dates`)
+    """
+    dates = pd.to_datetime(np.asarray(dates).ravel(), box=False)
+    dates = dates[pd.notnull(dates)]
+    unique_timedeltas = np.unique(np.diff(dates))
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    reference_date = dates[0] if len(dates) > 0 else '1970-01-01'
+    return '%s since %s' % (units, pd.Timestamp(reference_date))
+
+
+def infer_timedelta_units(deltas):
+    """Given an array of timedeltas, returns a CF compatible time-unit from
+    {'days', 'hours', 'minutes', 'seconds'} (the first one that can evenly
+    divide all unique time deltas in `deltas`)
+    """
+    deltas = pd.to_timedelta(np.asarray(deltas).ravel(), box=False)
+    unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return units
+
+
+def nctime_to_nptime(times):
+    """Given an array of netCDF4.datetime objects, return an array of
+    numpy.datetime64 objects of the same size"""
+    times = np.asarray(times)
+    new =
np.empty(times.shape, dtype='M8[ns]') + for i, t in np.ndenumerate(times): + dt = datetime(t.year, t.month, t.day, t.hour, t.minute, t.second) + new[i] = np.datetime64(dt) + return new + + +def _cleanup_netcdf_time_units(units): + delta, ref_date = _unpack_netcdf_time_units(units) + try: + units = '%s since %s' % (delta, format_timestamp(ref_date)) + except OutOfBoundsDatetime: + # don't worry about reifying the units if they're out of bounds + pass + return units + + +def _encode_datetime_with_netcdf4(dates, units, calendar): + """Fallback method for encoding dates using netCDF4-python. + + This method is more flexible than xarray's parsing using datetime64[ns] + arrays but also slower because it loops over each element. + """ + import netCDF4 as nc4 + + if np.issubdtype(dates.dtype, np.datetime64): + # numpy's broken datetime conversion only works for us precision + dates = dates.astype('M8[us]').astype(datetime) + + def encode_datetime(d): + return np.nan if d is None else nc4.date2num(d, units, calendar) + + return np.vectorize(encode_datetime)(dates) + + +def cast_to_int_if_safe(num): + int_num = np.array(num, dtype=np.int64) + if (num == int_num).all(): + num = int_num + return num + + +def encode_cf_datetime(dates, units=None, calendar=None): + """Given an array of datetime objects, returns the tuple `(num, units, + calendar)` suitable for a CF compliant time variable. + + Unlike `date2num`, this function can handle datetime64 arrays. + + See also + -------- + netCDF4.date2num + """ + dates = np.asarray(dates) + + if units is None: + units = infer_datetime_units(dates) + else: + units = _cleanup_netcdf_time_units(units) + + if calendar is None: + calendar = 'proleptic_gregorian' + + delta, ref_date = _unpack_netcdf_time_units(units) + try: + if calendar not in _STANDARD_CALENDARS or dates.dtype.kind == 'O': + # parse with netCDF4 instead + raise OutOfBoundsDatetime + assert dates.dtype == 'datetime64[ns]' + + delta_units = _netcdf_to_numpy_timeunit(delta) + time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]') + ref_date = np.datetime64(pd.Timestamp(ref_date)) + num = (dates - ref_date) / time_delta + + except (OutOfBoundsDatetime, OverflowError): + num = _encode_datetime_with_netcdf4(dates, units, calendar) + + num = cast_to_int_if_safe(num) + return (num, units, calendar) + + +def encode_cf_timedelta(timedeltas, units=None): + if units is None: + units = infer_timedelta_units(timedeltas) + + np_unit = _netcdf_to_numpy_timeunit(units) + num = 1.0 * timedeltas / np.timedelta64(1, np_unit) + num = np.where(pd.isnull(timedeltas), np.nan, num) + num = cast_to_int_if_safe(num) + return (num, units) + + +class CFDatetimeCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if np.issubdtype(data.dtype, np.datetime64): + (data, units, calendar) = encode_cf_datetime( + data, + encoding.pop('units', None), + encoding.pop('calendar', None)) + safe_setitem(attrs, 'units', units, name=name) + safe_setitem(attrs, 'calendar', calendar, name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'units' in attrs and 'since' in attrs['units']: + units = pop_to(attrs, encoding, 'units') + calendar = pop_to(attrs, encoding, 'calendar') + dtype = _decode_cf_datetime_dtype(data, units, calendar) + transform = partial( + decode_cf_datetime, units=units, calendar=calendar) + data = 
lazy_elemwise_func(data, transform, dtype) + + return Variable(dims, data, attrs, encoding) + + +class CFTimedeltaCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if np.issubdtype(data.dtype, np.timedelta64): + data, units = encode_cf_timedelta( + data, encoding.pop('units', None)) + safe_setitem(attrs, 'units', units, name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'units' in attrs and attrs['units'] in TIME_UNITS: + units = pop_to(attrs, encoding, 'units') + transform = partial(decode_cf_timedelta, units=units) + dtype = np.dtype('timedelta64[ns]') + data = lazy_elemwise_func(data, transform, dtype=dtype) + + return Variable(dims, data, attrs, encoding) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 993d93519b0..bf2ded8b562 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -32,6 +32,10 @@ class VariableCoder(object): the identity ``coder.decode(coder.encode(variable)) == variable``. If any options are necessary, they should be implemented as arguments to the __init__ method. + + The optional name argument to encode() and decode() exists solely for the + sake of better error messages, and should correspond to the name of + variables in the underlying store. """ def encode(self, variable, name=None): @@ -68,7 +72,7 @@ def __getitem__(self, key): def __repr__(self): return ("%s(%r, func=%r, dtype=%r)" % - (type(self).__name__, self.array, self._func, self._dtype)) + (type(self).__name__, self.array, self.func, self.dtype)) def lazy_elemwise_func(array, func, dtype): @@ -126,12 +130,14 @@ def pop_to(source, dest, key, name=None): def _apply_mask(data, # type: np.ndarray encoded_fill_values, # type: list - decoded_fill_value # type: Any - ): # type: npndarray + decoded_fill_value, # type: Any + dtype, # type: Any + ): # type: np.ndarray """Mask all matching values in a NumPy arrays.""" condition = False for fv in encoded_fill_values: condition |= data == fv + data = np.asarray(data, dtype=dtype) return np.where(condition, decoded_fill_value, data) @@ -145,11 +151,6 @@ def encode(self, variable, name=None): fill_value = pop_to(encoding, attrs, '_FillValue', name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) - variable = Variable(dims, data, attrs, encoding) - - if ('_FillValue' not in attrs and '_FillValue' not in encoding and - np.issubdtype(data.dtype, np.floating)): - attrs['_FillValue'] = data.dtype.type(np.nan) return Variable(dims, data, attrs, encoding) @@ -188,7 +189,89 @@ def decode(self, variable, name=None): if encoded_fill_values: transform = partial(_apply_mask, encoded_fill_values=encoded_fill_values, - decoded_fill_value=decoded_fill_value) + decoded_fill_value=decoded_fill_value, + dtype=dtype) data = lazy_elemwise_func(data, transform, dtype) return Variable(dims, data, attrs, encoding) + + +def _scale_offset_decoding(data, scale_factor, add_offset, dtype): + data = np.array(data, dtype=dtype, copy=True) + if scale_factor is not None: + data *= scale_factor + if add_offset is not None: + data += add_offset + return data + + +class CFScaleOffsetCoder(VariableCoder): + """Scale and offset variables according to CF conventions. 
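+
+    For example, a raw value of 100 stored with scale_factor=0.01 and
+    add_offset=5 decodes to 100 * 0.01 + 5 = 6.0 (as float64).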
+ + Follows the formula: + decode_values = encoded_values * scale_factor + add_offset + """ + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if 'scale_factor' in encoding or 'add_offset' in encoding: + data = data.astype(dtype=np.float64, copy=True) + if 'add_offset' in encoding: + data -= pop_to(encoding, attrs, 'add_offset', name=name) + if 'scale_factor' in encoding: + data /= pop_to(encoding, attrs, 'scale_factor', name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'scale_factor' in attrs or 'add_offset' in attrs: + scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) + add_offset = pop_to(attrs, encoding, 'add_offset', name=name) + dtype = np.float64 + transform = partial(_scale_offset_decoding, + scale_factor=scale_factor, + add_offset=add_offset, + dtype=dtype) + data = lazy_elemwise_func(data, transform, dtype) + + return Variable(dims, data, attrs, encoding) + + +class UnsignedIntegerCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if encoding.get('_Unsigned', False): + pop_to(encoding, attrs, '_Unsigned') + signed_dtype = np.dtype('i%s' % data.dtype.itemsize) + if '_FillValue' in attrs: + new_fill = signed_dtype.type(attrs['_FillValue']) + attrs['_FillValue'] = new_fill + data = duck_array_ops.around(data).astype(signed_dtype) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if '_Unsigned' in attrs: + unsigned = pop_to(attrs, encoding, '_Unsigned') + + if data.dtype.kind == 'i': + if unsigned: + unsigned_dtype = np.dtype('u%s' % data.dtype.itemsize) + transform = partial(np.asarray, dtype=unsigned_dtype) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + if '_FillValue' in attrs: + new_fill = unsigned_dtype.type(attrs['_FillValue']) + attrs['_FillValue'] = new_fill + else: + warnings.warn("variable %r has _Unsigned attribute but is not " + "of integer type. Ignoring attribute." 
% name, + SerializationWarning, stacklevel=3) + + return Variable(dims, data, attrs, encoding) diff --git a/xarray/conventions.py b/xarray/conventions.py index 5b951ff694b..ef80622b60f 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -1,455 +1,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from datetime import datetime -import re -import traceback -import warnings -import numpy as np -import pandas as pd +import warnings from collections import defaultdict -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime - -from .core import duck_array_ops, indexing, ops, utils -from .core.formatting import format_timestamp, first_n_items, last_item -from .core.variable import as_variable, IndexVariable, Variable -from .core.pycompat import iteritems, OrderedDict, PY3, basestring - - -# standard calendars recognized by netcdftime -_STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian']) - -_NS_PER_TIME_DELTA = {'us': 1e3, - 'ms': 1e6, - 's': 1e9, - 'm': 1e9 * 60, - 'h': 1e9 * 60 * 60, - 'D': 1e9 * 60 * 60 * 24} - - -class SerializationWarning(RuntimeWarning): - """Warnings about encoding/decoding issues in serialization.""" - - -def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, - dtype=float): - """Scale and mask array values according to CF conventions for packed and - missing values - - First, values equal to the fill_value are replaced by NaN. Then, new values - are given by the formula: - - original_values * scale_factor + add_offset - - Parameters - ---------- - array : array-like - Original array of values to wrap - fill_value : number, optional - All values equal to fill_value in the original array are replaced - by NaN. If an array of multiple values is provided a warning will be - issued and all array elements matching an value in the fill_value array - will be replaced by NaN. - scale_factor : number, optional - Multiply entries in the original array by this number. - add_offset : number, optional - After applying scale_factor, add this number to entries in the - original array. - - Returns - ------- - scaled : np.ndarray - Array of masked and scaled values. - - References - ---------- - http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html - """ - # by default, cast to float to ensure NaN is meaningful - values = np.array(array, dtype=dtype, copy=True) - if fill_value is not None and not np.all(pd.isnull(fill_value)): - if getattr(fill_value, 'size', 1) > 1: - fill_values = fill_value # multiple fill values - else: - fill_values = [fill_value] - for f_value in fill_values: - if values.ndim > 0: - values[values == f_value] = np.nan - elif values == f_value: - values = np.array(np.nan) - if scale_factor is not None: - values *= scale_factor - if add_offset is not None: - values += add_offset - return values - - -def _netcdf_to_numpy_timeunit(units): - units = units.lower() - if not units.endswith('s'): - units = '%ss' % units - return {'microseconds': 'us', 'milliseconds': 'ms', 'seconds': 's', - 'minutes': 'm', 'hours': 'h', 'days': 'D'}[units] - - -def _unpack_netcdf_time_units(units): - # CF datetime units follow the format: "UNIT since DATE" - # this parses out the unit and date allowing for extraneous - # whitespace. 
- matches = re.match('(.+) since (.+)', units) - if not matches: - raise ValueError('invalid time units: %s' % units) - delta_units, ref_date = [s.strip() for s in matches.groups()] - return delta_units, ref_date - - -def _decode_datetime_with_netcdf4(num_dates, units, calendar): - import netCDF4 as nc4 - - dates = np.asarray(nc4.num2date(num_dates, units, calendar)) - if (dates[np.nanargmin(num_dates)].year < 1678 or - dates[np.nanargmax(num_dates)].year >= 2262): - warnings.warn('Unable to decode time axis into full ' - 'numpy.datetime64 objects, continuing using dummy ' - 'netCDF4.datetime objects instead, reason: dates out' - ' of range', SerializationWarning, stacklevel=3) - else: - try: - dates = nctime_to_nptime(dates) - except ValueError as e: - warnings.warn('Unable to decode time axis into full ' - 'numpy.datetime64 objects, continuing using ' - 'dummy netCDF4.datetime objects instead, reason:' - '{0}'.format(e), SerializationWarning, stacklevel=3) - return dates - - -def decode_cf_datetime(num_dates, units, calendar=None): - """Given an array of numeric dates in netCDF format, convert it into a - numpy array of date time objects. - - For standard (Gregorian) calendars, this function uses vectorized - operations, which makes it much faster than netCDF4.num2date. In such a - case, the returned array will be of type np.datetime64. - - Note that time unit in `units` must not be smaller than microseconds and - not larger than days. - - See also - -------- - netCDF4.num2date - """ - num_dates = np.asarray(num_dates) - flat_num_dates = num_dates.ravel() - if calendar is None: - calendar = 'standard' - - delta, ref_date = _unpack_netcdf_time_units(units) - - try: - if calendar not in _STANDARD_CALENDARS: - raise OutOfBoundsDatetime - - delta = _netcdf_to_numpy_timeunit(delta) - try: - ref_date = pd.Timestamp(ref_date) - except ValueError: - # ValueError is raised by pd.Timestamp for non-ISO timestamp - # strings, in which case we fall back to using netCDF4 - raise OutOfBoundsDatetime - - # fixes: https://github.com/pydata/pandas/issues/14068 - # these lines check if the the lowest or the highest value in dates - # cause an OutOfBoundsDatetime (Overflow) error - pd.to_timedelta(flat_num_dates.min(), delta) + ref_date - pd.to_timedelta(flat_num_dates.max(), delta) + ref_date - - # Cast input dates to integers of nanoseconds because `pd.to_datetime` - # works much faster when dealing with integers - flat_num_dates_ns_int = (flat_num_dates * - _NS_PER_TIME_DELTA[delta]).astype(np.int64) - - dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + - ref_date).values - - except (OutOfBoundsDatetime, OverflowError): - dates = _decode_datetime_with_netcdf4(flat_num_dates.astype(np.float), - units, - calendar) - - return dates.reshape(num_dates.shape) - - -def decode_cf_timedelta(num_timedeltas, units): - """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. 
- """ - num_timedeltas = np.asarray(num_timedeltas) - units = _netcdf_to_numpy_timeunit(units) - shape = num_timedeltas.shape - num_timedeltas = num_timedeltas.ravel() - - result = pd.to_timedelta(num_timedeltas, unit=units, box=False) - # NaT is returned unboxed with wrong units; this should be fixed in pandas - if result.dtype != 'timedelta64[ns]': - result = result.astype('timedelta64[ns]') - return result.reshape(shape) - - -TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds', - 'milliseconds', 'microseconds']) - - -def _infer_time_units_from_diff(unique_timedeltas): - for time_unit, delta in [('days', 86400), ('hours', 3600), - ('minutes', 60), ('seconds', 1)]: - unit_delta = np.timedelta64(10 ** 9 * delta, 'ns') - diffs = unique_timedeltas / unit_delta - if np.all(diffs == diffs.astype(int)): - return time_unit - return 'seconds' - - -def infer_datetime_units(dates): - """Given an array of datetimes, returns a CF compatible time-unit string of - the form "{time_unit} since {date[0]}", where `time_unit` is 'days', - 'hours', 'minutes' or 'seconds' (the first one that can evenly divide all - unique time deltas in `dates`) - """ - dates = pd.to_datetime(np.asarray(dates).ravel(), box=False) - dates = dates[pd.notnull(dates)] - unique_timedeltas = np.unique(np.diff(dates)) - units = _infer_time_units_from_diff(unique_timedeltas) - reference_date = dates[0] if len(dates) > 0 else '1970-01-01' - return '%s since %s' % (units, pd.Timestamp(reference_date)) - - -def infer_timedelta_units(deltas): - """Given an array of timedeltas, returns a CF compatible time-unit from - {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly - divide all unique time deltas in `deltas`) - """ - deltas = pd.to_timedelta(np.asarray(deltas).ravel(), box=False) - unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) - units = _infer_time_units_from_diff(unique_timedeltas) - return units - - -def nctime_to_nptime(times): - """Given an array of netCDF4.datetime objects, return an array of - numpy.datetime64 objects of the same size""" - times = np.asarray(times) - new = np.empty(times.shape, dtype='M8[ns]') - for i, t in np.ndenumerate(times): - dt = datetime(t.year, t.month, t.day, t.hour, t.minute, t.second) - new[i] = np.datetime64(dt) - return new - - -def _cleanup_netcdf_time_units(units): - delta, ref_date = _unpack_netcdf_time_units(units) - try: - units = '%s since %s' % (delta, format_timestamp(ref_date)) - except OutOfBoundsDatetime: - # don't worry about reifying the units if they're out of bounds - pass - return units - - -def _encode_datetime_with_netcdf4(dates, units, calendar): - """Fallback method for encoding dates using netCDF4-python. - - This method is more flexible than xarray's parsing using datetime64[ns] - arrays but also slower because it loops over each element. - """ - import netCDF4 as nc4 - - if np.issubdtype(dates.dtype, np.datetime64): - # numpy's broken datetime conversion only works for us precision - dates = dates.astype('M8[us]').astype(datetime) - - def encode_datetime(d): - return np.nan if d is None else nc4.date2num(d, units, calendar) - - return np.vectorize(encode_datetime)(dates) - - -def cast_to_int_if_safe(num): - int_num = np.array(num, dtype=np.int64) - if (num == int_num).all(): - num = int_num - return num - - -def encode_cf_datetime(dates, units=None, calendar=None): - """Given an array of datetime objects, returns the tuple `(num, units, - calendar)` suitable for a CF compliant time variable. 
- - Unlike `date2num`, this function can handle datetime64 arrays. - - See also - -------- - netCDF4.date2num - """ - dates = np.asarray(dates) - - if units is None: - units = infer_datetime_units(dates) - else: - units = _cleanup_netcdf_time_units(units) - - if calendar is None: - calendar = 'proleptic_gregorian' - - delta, ref_date = _unpack_netcdf_time_units(units) - try: - if calendar not in _STANDARD_CALENDARS or dates.dtype.kind == 'O': - # parse with netCDF4 instead - raise OutOfBoundsDatetime - assert dates.dtype == 'datetime64[ns]' - - delta_units = _netcdf_to_numpy_timeunit(delta) - time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]') - ref_date = np.datetime64(pd.Timestamp(ref_date)) - num = (dates - ref_date) / time_delta - - except (OutOfBoundsDatetime, OverflowError): - num = _encode_datetime_with_netcdf4(dates, units, calendar) - - num = cast_to_int_if_safe(num) - return (num, units, calendar) - - -def encode_cf_timedelta(timedeltas, units=None): - if units is None: - units = infer_timedelta_units(timedeltas) - - np_unit = _netcdf_to_numpy_timeunit(units) - num = 1.0 * timedeltas / np.timedelta64(1, np_unit) - num = np.where(pd.isnull(timedeltas), np.nan, num) - num = cast_to_int_if_safe(num) - return (num, units) - - -class MaskedAndScaledArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically scaled and masked according to - CF conventions for packed and missing data values. - - New values are given by the formula: - original_values * scale_factor + add_offset - - Values can only be accessed via `__getitem__`: - - >>> x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1) - >>> x - MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99, - scale_factor=0.01, add_offset=1) - >>> x[:] - array([ nan, 0.99, 1. , 1.01, 1.02] - - References - ---------- - http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html - """ - def __init__(self, array, fill_value=None, scale_factor=None, - add_offset=None, dtype=float): - """ - Parameters - ---------- - array : array-like - Original array of values to wrap - fill_value : number, optional - All values equal to fill_value in the original array are replaced - by NaN. - scale_factor : number, optional - Multiply entries in the original array by this number. - add_offset : number, optional - After applying scale_factor, add this number to entries in the - original array. - """ - self.array = indexing.as_indexable(array) - self.fill_value = fill_value - self.scale_factor = scale_factor - self.add_offset = add_offset - self._dtype = dtype - - @property - def dtype(self): - return np.dtype(self._dtype) - - def __getitem__(self, key): - return mask_and_scale(self.array[key], self.fill_value, - self.scale_factor, self.add_offset, self._dtype) - - def __repr__(self): - return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r, " - "dtype=%r)" % - (type(self).__name__, self.array, self.fill_value, - self.scale_factor, self.add_offset, self._dtype)) - - -class DecodedCFDatetimeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically converted into datetime objects - using decode_cf_datetime. 
- """ - def __init__(self, array, units, calendar=None): - self.array = indexing.as_indexable(array) - self.units = units - self.calendar = calendar - - # Verify that at least the first and last date can be decoded - # successfully. Otherwise, tracebacks end up swallowed by - # Dataset.__repr__ when users try to view their lazily decoded array. - values = indexing.ImplicitToExplicitIndexingAdapter(self.array) - example_value = np.concatenate([first_n_items(values, 1) or [0], - last_item(values) or [0]]) - - try: - result = decode_cf_datetime(example_value, units, calendar) - except Exception: - calendar_msg = ('the default calendar' if calendar is None - else 'calendar %r' % calendar) - msg = ('unable to decode time units %r with %s. Try ' - 'opening your dataset with decode_times=False.' - % (units, calendar_msg)) - if not PY3: - msg += ' Full traceback:\n' + traceback.format_exc() - raise ValueError(msg) - else: - self._dtype = getattr(result, 'dtype', np.dtype('object')) - - @property - def dtype(self): - return self._dtype - - def __getitem__(self, key): - return decode_cf_datetime(self.array[key], units=self.units, - calendar=self.calendar) - - -class DecodedCFTimedeltaArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically converted into timedelta objects - using decode_cf_timedelta. - """ - def __init__(self, array, units): - self.array = indexing.as_indexable(array) - self.units = units +import numpy as np - @property - def dtype(self): - return np.dtype('timedelta64[ns]') +import pandas as pd - def __getitem__(self, key): - return decode_cf_timedelta(self.array[key], units=self.units) +from .coding import times +from .coding import variables +from .coding.variables import SerializationWarning +from .core import duck_array_ops, indexing +from .core.pycompat import OrderedDict, basestring, iteritems +from .core.variable import IndexVariable, Variable, as_variable class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): @@ -594,34 +159,6 @@ def __getitem__(self, key): return np.asarray(self.array[key], dtype=self.dtype) -class UnsignedIntTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from signed integer to unsigned - integer. Typically used when _Unsigned is set at as a netCDF - attribute on a signed integer variable. - - >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') - - >>> sb.dtype - dtype('int8') - - >>> UnsignedIntTypeArray(sb).dtype - dtype('uint8') - - >>> UnsignedIntTypeArray(sb)[:] - array([ 0, 1, 127, 128, 255], dtype=uint8) - """ - def __init__(self, array): - self.array = indexing.as_indexable(array) - self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize) - - @property - def dtype(self): - return self.unsigned_dtype - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def bytes_to_char(arr): """Like netCDF4.stringtochar, but faster and more flexible. 
""" @@ -696,56 +233,17 @@ def _var_as_tuple(var): return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_datetime(var, name=None): - if np.issubdtype(var.dtype, np.datetime64): - dims, data, attrs, encoding = _var_as_tuple(var) - (data, units, calendar) = encode_cf_datetime( - data, encoding.pop('units', None), encoding.pop('calendar', None)) - safe_setitem(attrs, 'units', units, name=name) - safe_setitem(attrs, 'calendar', calendar, name=name) - var = Variable(dims, data, attrs, encoding) - return var - - -def maybe_encode_timedelta(var, name=None): - if np.issubdtype(var.dtype, np.timedelta64): - dims, data, attrs, encoding = _var_as_tuple(var) - data, units = encode_cf_timedelta( - data, encoding.pop('units', None)) - safe_setitem(attrs, 'units', units, name=name) - var = Variable(dims, data, attrs, encoding) - return var - - -def maybe_encode_offset_and_scale(var, needs_copy=True, name=None): - if any(k in var.encoding for k in ['add_offset', 'scale_factor']): - dims, data, attrs, encoding = _var_as_tuple(var) - data = data.astype(dtype=float, copy=needs_copy) - needs_copy = False - if 'add_offset' in encoding: - data -= pop_to(encoding, attrs, 'add_offset', name=name) - if 'scale_factor' in encoding: - data /= pop_to(encoding, attrs, 'scale_factor', name=name) - var = Variable(dims, data, attrs, encoding) - return var, needs_copy - - -def maybe_encode_fill_value(var, needs_copy=True, name=None): - # replace NaN with the fill value - if var.encoding.get('_FillValue') is not None: - dims, data, attrs, encoding = _var_as_tuple(var) - fill_value = pop_to(encoding, attrs, '_FillValue', name=name) - if not pd.isnull(fill_value): - data = ops.fillna(data, fill_value) - needs_copy = False - var = Variable(dims, data, attrs, encoding) - return var, needs_copy - - def maybe_encode_as_char_array(var, name=None): if var.dtype.kind in {'S', 'U'}: dims, data, attrs, encoding = _var_as_tuple(var) if data.dtype.kind == 'U': + if '_FillValue' in attrs: + raise NotImplementedError( + 'variable {!r} has a _FillValue specified, but ' + '_FillValue is yet supported on unicode strings: ' + 'https://github.com/pydata/xarray/issues/1647' + .format(name)) + string_encoding = encoding.pop('_Encoding', 'utf-8') safe_setitem(attrs, '_Encoding', string_encoding, name=name) data = encode_string_array(data, string_encoding) @@ -780,13 +278,6 @@ def maybe_encode_nonstring_dtype(var, name=None): 'any _FillValue to use for NaNs' % name, SerializationWarning, stacklevel=3) data = duck_array_ops.around(data)[...] - if encoding.get('_Unsigned', False): - signed_dtype = np.dtype('i%s' % dtype.itemsize) - if '_FillValue' in var.attrs: - new_fill = signed_dtype.type(attrs['_FillValue']) - attrs['_FillValue'] = new_fill - data = data.astype(signed_dtype) - pop_to(encoding, attrs, '_Unsigned') data = data.astype(dtype=dtype) var = Variable(dims, data, attrs, encoding) return var @@ -830,18 +321,20 @@ def _infer_dtype(array, name=None): return dtype +def ensure_not_multiindex(var, name=None): + if (isinstance(var, IndexVariable) and + isinstance(var.to_index(), pd.MultiIndex)): + raise NotImplementedError( + 'variable {!r} is a MultiIndex, which cannot yet be ' + 'serialized to netCDF files ' + '(https://github.com/pydata/xarray/issues/1077). Use ' + 'reset_index() to convert MultiIndex levels into coordinate ' + 'variables instead.'.format(name)) + + def ensure_dtype_not_object(var, name=None): # TODO: move this from conventions to backends? 
(it's not CF related) if var.dtype.kind == 'O': - if (isinstance(var, IndexVariable) and - isinstance(var.to_index(), pd.MultiIndex)): - raise NotImplementedError( - 'variable {!r} is a MultiIndex, which cannot yet be ' - 'serialized to netCDF files ' - '(https://github.com/pydata/xarray/issues/1077). Use ' - 'reset_index() to convert MultiIndex levels into coordinate ' - 'variables instead.'.format(name)) - dims, data, attrs, encoding = _var_as_tuple(var) missing = pd.isnull(data) if missing.any(): @@ -890,10 +383,16 @@ def encode_cf_variable(var, needs_copy=True, name=None): out : xarray.Variable A variable which has been encoded as described above. """ - var = maybe_encode_datetime(var, name=name) - var = maybe_encode_timedelta(var, name=name) - var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy, name=name) - var, needs_copy = maybe_encode_fill_value(var, needs_copy, name=name) + ensure_not_multiindex(var, name=name) + + for coder in [times.CFDatetimeCoder(), + times.CFTimedeltaCoder(), + variables.CFScaleOffsetCoder(), + variables.CFMaskCoder(), + variables.UnsignedIntegerCoder()]: + var = coder.encode(var, name=name) + + # TODO(shoyer): convert all of these to use coders, too: var = maybe_encode_nonstring_dtype(var, name=name) var = maybe_default_fill_value(var) var = maybe_encode_bools(var) @@ -958,73 +457,22 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, if string_encoding is not None: data = BytesToStringArray(data, string_encoding) - unsigned = pop_to(attributes, encoding, '_Unsigned') - if unsigned and mask_and_scale: - if data.dtype.kind == 'i': - data = UnsignedIntTypeArray(data) - else: - warnings.warn("variable %r has _Unsigned attribute but is not " - "of integer type. Ignoring attribute." % name, - SerializationWarning, stacklevel=3) + # TODO(shoyer): convert everything above to use coders + var = Variable(dimensions, data, attributes, encoding) if mask_and_scale: - if 'missing_value' in attributes: - # missing_value is deprecated, but we still want to support it as - # an alias for _FillValue. - if ('_FillValue' in attributes and - not utils.equivalent(attributes['_FillValue'], - attributes['missing_value'])): - raise ValueError("Conflicting _FillValue and missing_value " - "attributes on a variable {!r}: {} vs. {}\n\n" - "Consider opening the offending dataset " - "using decode_cf=False, correcting the " - "attributes and decoding explicitly using " - "xarray.decode_cf()." - .format(name, attributes['_FillValue'], - attributes['missing_value'])) - attributes['_FillValue'] = attributes.pop('missing_value') - - fill_value = pop_to(attributes, encoding, '_FillValue') - if isinstance(fill_value, np.ndarray) and fill_value.size > 1: - warnings.warn("variable {!r} has multiple fill values {}, " - "decoding all values to NaN." 
- .format(name, fill_value), - SerializationWarning, stacklevel=3) - - scale_factor = pop_to(attributes, encoding, 'scale_factor') - add_offset = pop_to(attributes, encoding, 'add_offset') - has_fill = (fill_value is not None and - not np.any(pd.isnull(fill_value))) - if (has_fill or scale_factor is not None or add_offset is not None): - if has_fill and np.array(fill_value).dtype.kind in ['U', 'S', 'O']: - if string_encoding is not None: - raise NotImplementedError( - 'variable %r has a _FillValue specified, but ' - '_FillValue is yet supported on unicode strings: ' - 'https://github.com/pydata/xarray/issues/1647') - dtype = object - else: - # According to the CF spec, the fill value is of the same - # type as its variable, i.e. its storage format on disk. - # This handles the case where the fill_value also needs to be - # converted to its unsigned value. - if has_fill: - fill_value = data.dtype.type(fill_value) - dtype = float - - data = MaskedAndScaledArray(data, fill_value, scale_factor, - add_offset, dtype) - - if decode_times and 'units' in attributes: - if 'since' in attributes['units']: - # datetime - units = pop_to(attributes, encoding, 'units') - calendar = pop_to(attributes, encoding, 'calendar') - data = DecodedCFDatetimeArray(data, units, calendar) - elif attributes['units'] in TIME_UNITS: - # timedelta - units = pop_to(attributes, encoding, 'units') - data = DecodedCFTimedeltaArray(data, units) + for coder in [variables.UnsignedIntegerCoder(), + variables.CFMaskCoder(), + variables.CFScaleOffsetCoder()]: + var = coder.decode(var, name=name) + if decode_times: + for coder in [times.CFTimedeltaCoder(), + times.CFDatetimeCoder()]: + var = coder.decode(var, name=name) + + dimensions, data, attributes, encoding = ( + variables.unpack_for_decoding(var)) + # TODO(shoyer): convert everything below to use coders if decode_endianness and not data.dtype.isnative: # do this last, so it's only done if we didn't already unmask/scale diff --git a/xarray/convert.py b/xarray/convert.py index 446bd5a0d35..caf665b421d 100644 --- a/xarray/convert.py +++ b/xarray/convert.py @@ -6,11 +6,11 @@ import numpy as np +from .coding.times import CFDatetimeCoder, CFTimedeltaCoder from .core.dataarray import DataArray from .core.pycompat import OrderedDict, range from .core.dtypes import get_fill_value -from .conventions import ( - maybe_encode_timedelta, maybe_encode_datetime, decode_cf) +from .conventions import decode_cf cdms2_ignored_attrs = {'name', 'tileIndex'} iris_forbidden_keys = {'standard_name', 'long_name', 'units', 'bounds', 'axis', @@ -25,7 +25,7 @@ def encode(var): - return maybe_encode_timedelta(maybe_encode_datetime(var.variable)) + return CFTimedeltaCoder().encode(CFDatetimeCoder().encode(var.variable)) def _filter_attrs(attrs, ignored_attrs): diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py new file mode 100644 index 00000000000..f4c726355a5 --- /dev/null +++ b/xarray/tests/test_coding_times.py @@ -0,0 +1,323 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import warnings + +import numpy as np +import pandas as pd + +from xarray import Variable, coding +from . 
import TestCase, requires_netCDF4 + + +@np.vectorize +def _ensure_naive_tz(dt): + if hasattr(dt, 'tzinfo'): + return dt.replace(tzinfo=None) + else: + return dt + + +class TestDatetime(TestCase): + @requires_netCDF4 + def test_cf_datetime(self): + import netCDF4 as nc4 + for num_dates, units in [ + (np.arange(10), 'days since 2000-01-01'), + (np.arange(10).reshape(2, 5), 'days since 2000-01-01'), + (12300 + np.arange(5), 'hours since 1680-01-01 00:00:00'), + # here we add a couple minor formatting errors to test + # the robustness of the parsing algorithm. + (12300 + np.arange(5), 'hour since 1680-01-01 00:00:00'), + (12300 + np.arange(5), u'Hour since 1680-01-01 00:00:00'), + (12300 + np.arange(5), ' Hour since 1680-01-01 00:00:00 '), + (10, 'days since 2000-01-01'), + ([10], 'daYs since 2000-01-01'), + ([[10]], 'days since 2000-01-01'), + ([10, 10], 'days since 2000-01-01'), + (np.array(10), 'days since 2000-01-01'), + (0, 'days since 1000-01-01'), + ([0], 'days since 1000-01-01'), + ([[0]], 'days since 1000-01-01'), + (np.arange(2), 'days since 1000-01-01'), + (np.arange(0, 100000, 20000), 'days since 1900-01-01'), + (17093352.0, 'hours since 1-1-1 00:00:0.0'), + ([0.5, 1.5], 'hours since 1900-01-01T00:00:00'), + (0, 'milliseconds since 2000-01-01T00:00:00'), + (0, 'microseconds since 2000-01-01T00:00:00'), + ]: + for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: + expected = _ensure_naive_tz( + nc4.num2date(num_dates, units, calendar)) + print(num_dates, units, calendar) + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', + 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(num_dates, units, + calendar) + if (isinstance(actual, np.ndarray) and + np.issubdtype(actual.dtype, np.datetime64)): + # self.assertEqual(actual.dtype.kind, 'M') + # For some reason, numpy 1.8 does not compare ns precision + # datetime64 arrays as equal to arrays of datetime objects, + # but it works for us precision. Thus, convert to us + # precision for the actual array equal comparison... 
+ actual_cmp = actual.astype('M8[us]') + else: + actual_cmp = actual + self.assertArrayEqual(expected, actual_cmp) + encoded, _, _ = coding.times.encode_cf_datetime(actual, units, + calendar) + if '1-1-1' not in units: + # pandas parses this date very strangely, so the original + # units/encoding cannot be preserved in this case: + # (Pdb) pd.to_datetime('1-1-1 00:00:0.0') + # Timestamp('2001-01-01 00:00:00') + self.assertArrayEqual(num_dates, np.around(encoded, 1)) + if (hasattr(num_dates, 'ndim') and num_dates.ndim == 1 and + '1000' not in units): + # verify that wrapping with a pandas.Index works + # note that it *does not* currently work to even put + # non-datetime64 compatible dates into a pandas.Index + encoded, _, _ = coding.times.encode_cf_datetime( + pd.Index(actual), units, calendar) + self.assertArrayEqual(num_dates, np.around(encoded, 1)) + + @requires_netCDF4 + def test_decode_cf_datetime_overflow(self): + # checks for + # https://github.com/pydata/pandas/issues/14068 + # https://github.com/pydata/xarray/issues/975 + + from datetime import datetime + units = 'days since 2000-01-01 00:00:00' + + # date after 2262 and before 1678 + days = (-117608, 95795) + expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + + for i, day in enumerate(days): + result = coding.times.decode_cf_datetime(day, units) + self.assertEqual(result, expected[i]) + + def test_decode_cf_datetime_non_standard_units(self): + expected = pd.date_range(periods=100, start='1970-01-01', freq='h') + # netCDFs from madis.noaa.gov use this format for their time units + # they cannot be parsed by netcdftime, but pd.Timestamp works + units = 'hours since 1-1-1970' + actual = coding.times.decode_cf_datetime(np.arange(100), units) + self.assertArrayEqual(actual, expected) + + @requires_netCDF4 + def test_decode_cf_datetime_non_iso_strings(self): + # datetime strings that are _almost_ ISO compliant but not quite, + # but which netCDF4.num2date can still parse correctly + expected = pd.date_range(periods=100, start='2000-01-01', freq='h') + cases = [(np.arange(100), 'hours since 2000-01-01 0'), + (np.arange(100), 'hours since 2000-1-1 0'), + (np.arange(100), 'hours since 2000-01-01 0:00')] + for num_dates, units in cases: + actual = coding.times.decode_cf_datetime(num_dates, units) + self.assertArrayEqual(actual, expected) + + @requires_netCDF4 + def test_decode_non_standard_calendar(self): + import netCDF4 as nc4 + + for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', + '366_day']: + units = 'days since 0001-01-01' + times = pd.date_range('2001-04-01-00', end='2001-04-30-23', + freq='H') + noleap_time = nc4.date2num(times.to_pydatetime(), units, + calendar=calendar) + expected = times.values + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(noleap_time, units, + calendar=calendar) + self.assertEqual(actual.dtype, np.dtype('M8[ns]')) + abs_diff = abs(actual - expected) + # once we no longer support versions of netCDF4 older than 1.1.5, + # we could do this check with near microsecond accuracy: + # https://github.com/Unidata/netcdf4-python/issues/355 + self.assertTrue((abs_diff <= np.timedelta64(1, 's')).all()) + + @requires_netCDF4 + def test_decode_non_standard_calendar_single_element(self): + units = 'days since 0001-01-01' + for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', + '366_day']: + for num_time in [735368, [735368], [[735368]]]: + with 
warnings.catch_warnings(): + warnings.filterwarnings('ignore', + 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(num_time, units, + calendar=calendar) + self.assertEqual(actual.dtype, np.dtype('M8[ns]')) + + @requires_netCDF4 + def test_decode_non_standard_calendar_single_element_fallback(self): + import netCDF4 as nc4 + + units = 'days since 0001-01-01' + dt = nc4.netcdftime.datetime(2001, 2, 29) + for calendar in ['360_day', 'all_leap', '366_day']: + num_time = nc4.date2num(dt, units, calendar) + with self.assertWarns('Unable to decode time axis'): + actual = coding.times.decode_cf_datetime(num_time, units, + calendar=calendar) + expected = np.asarray(nc4.num2date(num_time, units, calendar)) + print(num_time, calendar, actual, expected) + self.assertEqual(actual.dtype, np.dtype('O')) + self.assertEqual(expected, actual) + + @requires_netCDF4 + def test_decode_non_standard_calendar_multidim_time(self): + import netCDF4 as nc4 + + calendar = 'noleap' + units = 'days since 0001-01-01' + times1 = pd.date_range('2001-04-01', end='2001-04-05', freq='D') + times2 = pd.date_range('2001-05-01', end='2001-05-05', freq='D') + noleap_time1 = nc4.date2num(times1.to_pydatetime(), units, + calendar=calendar) + noleap_time2 = nc4.date2num(times2.to_pydatetime(), units, + calendar=calendar) + mdim_time = np.empty((len(noleap_time1), 2), ) + mdim_time[:, 0] = noleap_time1 + mdim_time[:, 1] = noleap_time2 + + expected1 = times1.values + expected2 = times2.values + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(mdim_time, units, + calendar=calendar) + self.assertEqual(actual.dtype, np.dtype('M8[ns]')) + self.assertArrayEqual(actual[:, 0], expected1) + self.assertArrayEqual(actual[:, 1], expected2) + + @requires_netCDF4 + def test_decode_non_standard_calendar_fallback(self): + import netCDF4 as nc4 + # ensure leap year doesn't matter + for year in [2010, 2011, 2012, 2013, 2014]: + for calendar in ['360_day', '366_day', 'all_leap']: + calendar = '360_day' + units = 'days since {0}-01-01'.format(year) + num_times = np.arange(100) + expected = nc4.num2date(num_times, units, calendar) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + actual = coding.times.decode_cf_datetime(num_times, units, + calendar=calendar) + self.assertEqual(len(w), 1) + self.assertIn('Unable to decode time axis', + str(w[0].message)) + + self.assertEqual(actual.dtype, np.dtype('O')) + self.assertArrayEqual(actual, expected) + + @requires_netCDF4 + def test_cf_datetime_nan(self): + for num_dates, units, expected_list in [ + ([np.nan], 'days since 2000-01-01', ['NaT']), + ([np.nan, 0], 'days since 2000-01-01', + ['NaT', '2000-01-01T00:00:00Z']), + ([np.nan, 0, 1], 'days since 2000-01-01', + ['NaT', '2000-01-01T00:00:00Z', '2000-01-02T00:00:00Z']), + ]: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'All-NaN') + actual = coding.times.decode_cf_datetime(num_dates, units) + expected = np.array(expected_list, dtype='datetime64[ns]') + self.assertArrayEqual(expected, actual) + + @requires_netCDF4 + def test_decoded_cf_datetime_array_2d(self): + # regression test for GH1229 + variable = Variable(('x', 'y'), np.array([[0, 1], [2, 3]]), + {'units': 'days since 2000-01-01'}) + result = coding.times.CFDatetimeCoder().decode(variable) + assert result.dtype == 'datetime64[ns]' + expected = pd.date_range('2000-01-01', periods=4).values.reshape(2, 2) + 
self.assertArrayEqual(np.asarray(result), expected) + + def test_infer_datetime_units(self): + for dates, expected in [(pd.date_range('1900-01-01', periods=5), + 'days since 1900-01-01 00:00:00'), + (pd.date_range('1900-01-01 12:00:00', freq='H', + periods=2), + 'hours since 1900-01-01 12:00:00'), + (['1900-01-01', '1900-01-02', + '1900-01-02 00:00:01'], + 'seconds since 1900-01-01 00:00:00'), + (pd.to_datetime( + ['1900-01-01', '1900-01-02', 'NaT']), + 'days since 1900-01-01 00:00:00'), + (pd.to_datetime(['1900-01-01', + '1900-01-02T00:00:00.005']), + 'seconds since 1900-01-01 00:00:00'), + (pd.to_datetime(['NaT', '1900-01-01']), + 'days since 1900-01-01 00:00:00'), + (pd.to_datetime(['NaT']), + 'days since 1970-01-01 00:00:00'), + ]: + self.assertEqual( + expected, coding.times.infer_datetime_units(dates)) + + def test_cf_timedelta(self): + examples = [ + ('1D', 'days', np.int64(1)), + (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')), + ('1h', 'hours', np.int64(1)), + ('1ms', 'milliseconds', np.int64(1)), + ('1us', 'microseconds', np.int64(1)), + (['NaT', '0s', '1s'], None, [np.nan, 0, 1]), + (['30m', '60m'], 'hours', [0.5, 1.0]), + (np.timedelta64('NaT', 'ns'), 'days', np.nan), + (['NaT', 'NaT'], 'days', [np.nan, np.nan]), + ] + + for timedeltas, units, numbers in examples: + timedeltas = pd.to_timedelta(timedeltas, box=False) + numbers = np.array(numbers) + + expected = numbers + actual, _ = coding.times.encode_cf_timedelta(timedeltas, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + if units is not None: + expected = timedeltas + actual = coding.times.decode_cf_timedelta(numbers, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + expected = np.timedelta64('NaT', 'ns') + actual = coding.times.decode_cf_timedelta(np.array(np.nan), 'days') + self.assertArrayEqual(expected, actual) + + def test_cf_timedelta_2d(self): + timedeltas = ['1D', '2D', '3D'] + units = 'days' + numbers = np.atleast_2d([1, 2, 3]) + + timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False)) + expected = timedeltas + + actual = coding.times.decode_cf_timedelta(numbers, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + def test_infer_timedelta_units(self): + for deltas, expected in [ + (pd.to_timedelta(['1 day', '2 days']), 'days'), + (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'), + (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'), + (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]: + self.assertEqual( + expected, coding.times.infer_timedelta_units(deltas)) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index ca88ea661c7..0d0d1efd598 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -2,16 +2,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import contextlib +import warnings import numpy as np import pandas as pd -import pytest -import warnings - import pytest from xarray import conventions, Variable, Dataset, open_dataset from xarray.core import utils, indexing +from xarray.testing import assert_identical from . 
import TestCase, requires_netCDF4, unittest, raises_regex, IndexerMaker from .test_backends import CFEncodedDataTest from xarray.core.pycompat import iteritems @@ -25,46 +25,6 @@ V = IndexerMaker(indexing.VectorizedIndexer) -class TestMaskedAndScaledArray(TestCase): - def test(self): - x = conventions.MaskedAndScaledArray(np.arange(3), fill_value=0) - self.assertEqual(x.dtype, np.dtype('float')) - self.assertEqual(x.shape, (3,)) - self.assertEqual(x.size, 3) - self.assertEqual(x.ndim, 1) - self.assertEqual(len(x), 3) - self.assertArrayEqual([np.nan, 1, 2], x) - - x = conventions.MaskedAndScaledArray(np.arange(3), add_offset=1) - self.assertArrayEqual(np.arange(3) + 1, x) - - x = conventions.MaskedAndScaledArray(np.arange(3), scale_factor=2) - self.assertArrayEqual(2 * np.arange(3), x) - - x = conventions.MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), - -99, 0.01, 1) - expected = np.array([np.nan, 0.99, 1, 1.01, 1.02]) - self.assertArrayEqual(expected, x) - - def test_0d(self): - x = conventions.MaskedAndScaledArray(np.array(0), fill_value=0) - self.assertTrue(np.isnan(x)) - self.assertTrue(np.isnan(x[B[()]])) - - x = conventions.MaskedAndScaledArray(np.array(0), fill_value=10) - self.assertEqual(0, x[B[()]]) - - def test_multiple_fill_value(self): - x = conventions.MaskedAndScaledArray( - np.arange(4), fill_value=np.array([0, 1])) - self.assertArrayEqual([np.nan, np.nan, 2, 3], x) - - x = conventions.MaskedAndScaledArray( - np.array(0), fill_value=np.array([0, 1])) - self.assertTrue(np.isnan(x)) - self.assertTrue(np.isnan(x[B[()]])) - - class TestStackedBytesArray(TestCase): def test_wrapper_class(self): array = np.array([[b'a', b'b', b'c'], [b'd', b'e', b'f']], dtype='S') @@ -176,15 +136,6 @@ def test_decode_bytes_array(self): np.testing.assert_array_equal(actual, expected) -class TestUnsignedIntTypeArray(TestCase): - def test_unsignedinttype_array(self): - sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') - ub = conventions.UnsignedIntTypeArray(sb) - self.assertEqual(ub.dtype, np.dtype('u1')) - self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255], - dtype=np.dtype('u1'))) - - class TestBoolTypeArray(TestCase): def test_booltype_array(self): x = np.array([1, 0, 1, 1, 0], dtype='i1') @@ -194,397 +145,6 @@ def test_booltype_array(self): dtype=np.bool)) -@np.vectorize -def _ensure_naive_tz(dt): - if hasattr(dt, 'tzinfo'): - return dt.replace(tzinfo=None) - else: - return dt - - -class TestDatetime(TestCase): - @requires_netCDF4 - def test_cf_datetime(self): - import netCDF4 as nc4 - for num_dates, units in [ - (np.arange(10), 'days since 2000-01-01'), - (np.arange(10).reshape(2, 5), 'days since 2000-01-01'), - (12300 + np.arange(5), 'hours since 1680-01-01 00:00:00'), - # here we add a couple minor formatting errors to test - # the robustness of the parsing algorithm. 
@@ -194,397 +145,6 @@ def test_booltype_array(self):
                                        dtype=np.bool))
 
 
-@np.vectorize
-def _ensure_naive_tz(dt):
-    if hasattr(dt, 'tzinfo'):
-        return dt.replace(tzinfo=None)
-    else:
-        return dt
-
-
-class TestDatetime(TestCase):
-    @requires_netCDF4
-    def test_cf_datetime(self):
-        import netCDF4 as nc4
-        for num_dates, units in [
-                (np.arange(10), 'days since 2000-01-01'),
-                (np.arange(10).reshape(2, 5), 'days since 2000-01-01'),
-                (12300 + np.arange(5), 'hours since 1680-01-01 00:00:00'),
-                # here we add a couple minor formatting errors to test
-                # the robustness of the parsing algorithm.
-                (12300 + np.arange(5), 'hour since 1680-01-01 00:00:00'),
-                (12300 + np.arange(5), u'Hour since 1680-01-01 00:00:00'),
-                (12300 + np.arange(5), ' Hour since 1680-01-01 00:00:00 '),
-                (10, 'days since 2000-01-01'),
-                ([10], 'daYs since 2000-01-01'),
-                ([[10]], 'days since 2000-01-01'),
-                ([10, 10], 'days since 2000-01-01'),
-                (np.array(10), 'days since 2000-01-01'),
-                (0, 'days since 1000-01-01'),
-                ([0], 'days since 1000-01-01'),
-                ([[0]], 'days since 1000-01-01'),
-                (np.arange(2), 'days since 1000-01-01'),
-                (np.arange(0, 100000, 20000), 'days since 1900-01-01'),
-                (17093352.0, 'hours since 1-1-1 00:00:0.0'),
-                ([0.5, 1.5], 'hours since 1900-01-01T00:00:00'),
-                (0, 'milliseconds since 2000-01-01T00:00:00'),
-                (0, 'microseconds since 2000-01-01T00:00:00'),
-        ]:
-            for calendar in ['standard', 'gregorian',
-                             'proleptic_gregorian']:
-                expected = _ensure_naive_tz(
-                    nc4.num2date(num_dates, units, calendar))
-                print(num_dates, units, calendar)
-                with warnings.catch_warnings():
-                    warnings.filterwarnings('ignore',
-                                            'Unable to decode time axis')
-                    actual = conventions.decode_cf_datetime(num_dates,
-                                                            units, calendar)
-                if (isinstance(actual, np.ndarray) and
-                        np.issubdtype(actual.dtype, np.datetime64)):
-                    # self.assertEqual(actual.dtype.kind, 'M')
-                    # For some reason, numpy 1.8 does not compare ns precision
-                    # datetime64 arrays as equal to arrays of datetime
-                    # objects, but it works for us precision. Thus, convert
-                    # to us precision for the actual array equal comparison...
-                    actual_cmp = actual.astype('M8[us]')
-                else:
-                    actual_cmp = actual
-                self.assertArrayEqual(expected, actual_cmp)
-                encoded, _, _ = conventions.encode_cf_datetime(actual, units,
                                                                calendar)
-                if '1-1-1' not in units:
-                    # pandas parses this date very strangely, so the original
-                    # units/encoding cannot be preserved in this case:
-                    # (Pdb) pd.to_datetime('1-1-1 00:00:0.0')
-                    # Timestamp('2001-01-01 00:00:00')
-                    self.assertArrayEqual(num_dates, np.around(encoded, 1))
-                    if (hasattr(num_dates, 'ndim') and num_dates.ndim == 1 and
-                            '1000' not in units):
-                        # verify that wrapping with a pandas.Index works
-                        # note that it *does not* currently work to even put
-                        # non-datetime64 compatible dates into a pandas.Index
-                        encoded, _, _ = conventions.encode_cf_datetime(
-                            pd.Index(actual), units, calendar)
-                        self.assertArrayEqual(num_dates,
-                                              np.around(encoded, 1))
-
-    @requires_netCDF4
-    def test_decode_cf_datetime_overflow(self):
-        # checks for
-        # https://github.com/pydata/pandas/issues/14068
-        # https://github.com/pydata/xarray/issues/975
-        from datetime import datetime
-        units = 'days since 2000-01-01 00:00:00'
-
-        # date after 2262 and before 1678
-        days = (-117608, 95795)
-        expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
-
-        for i, day in enumerate(days):
-            result = conventions.decode_cf_datetime(day, units)
-            self.assertEqual(result, expected[i])
-
-    @requires_netCDF4
-    def test_decode_cf_datetime_transition_to_invalid(self):
-        # manually create dataset with not-decoded date
-        from datetime import datetime
-        ds = Dataset(coords={'time': [0, 266 * 365]})
-        units = 'days since 2000-01-01 00:00:00'
-        ds.time.attrs = dict(units=units)
-        ds_decoded = conventions.decode_cf(ds)
-
-        expected = [datetime(2000, 1, 1, 0, 0),
-                    datetime(2265, 10, 28, 0, 0)]
-
-        self.assertArrayEqual(ds_decoded.time.values, expected)
-
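The two overflow tests above turn on the representable span of nanosecond-resolution datetime64: values outside roughly 1677-09-21 through 2262-04-11 cannot be held in a datetime64[ns] array, so decoding has to fall back to datetime objects. The bounds are easy to check in plain pandas:

import pandas as pd

print(pd.Timestamp.min)  # 1677-09-21 00:12:43.145225
print(pd.Timestamp.max)  # 2262-04-11 23:47:16.854775807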
-    def test_decoded_cf_datetime_array(self):
-        actual = conventions.DecodedCFDatetimeArray(
-            np.array([0, 1, 2]), 'days since 1900-01-01', 'standard')
-        expected = pd.date_range('1900-01-01', periods=3).values
-        self.assertEqual(actual.dtype, np.dtype('datetime64[ns]'))
-        self.assertArrayEqual(actual, expected)
-
-        # default calendar
-        actual = conventions.DecodedCFDatetimeArray(
-            np.array([0, 1, 2]), 'days since 1900-01-01')
-        self.assertEqual(actual.dtype, np.dtype('datetime64[ns]'))
-        self.assertArrayEqual(actual, expected)
-
-    def test_slice_decoded_cf_datetime_array(self):
-        actual = conventions.DecodedCFDatetimeArray(
-            np.array([0, 1, 2]), 'days since 1900-01-01', 'standard')
-        expected = pd.date_range('1900-01-01', periods=3).values
-        self.assertEqual(actual.dtype, np.dtype('datetime64[ns]'))
-        self.assertArrayEqual(actual[B[0:2]], expected[slice(0, 2)])
-
-        actual = conventions.DecodedCFDatetimeArray(
-            np.array([0, 1, 2]), 'days since 1900-01-01', 'standard')
-        expected = pd.date_range('1900-01-01', periods=3).values
-        self.assertEqual(actual.dtype, np.dtype('datetime64[ns]'))
-        self.assertArrayEqual(actual[O[np.array([0, 2])]], expected[[0, 2]])
-
-    def test_decode_cf_datetime_non_standard_units(self):
-        expected = pd.date_range(periods=100, start='1970-01-01', freq='h')
-        # netCDFs from madis.noaa.gov use this format for their time units
-        # they cannot be parsed by netcdftime, but pd.Timestamp works
-        units = 'hours since 1-1-1970'
-        actual = conventions.decode_cf_datetime(np.arange(100), units)
-        self.assertArrayEqual(actual, expected)
-
-    def test_decode_cf_with_conflicting_fill_missing_value(self):
-        var = Variable(['t'], np.arange(10),
-                       {'units': 'foobar',
-                        'missing_value': 0,
-                        '_FillValue': 1})
-        with raises_regex(ValueError, "_FillValue and missing_value"):
-            conventions.decode_cf_variable('t', var)
-
-        var = Variable(['t'], np.arange(10),
-                       {'units': 'foobar',
-                        'missing_value': np.nan,
-                        '_FillValue': np.nan})
-        var = conventions.decode_cf_variable('t', var)
-        self.assertIsNotNone(var)
-
-        var = Variable(['t'], np.arange(10),
-                       {'units': 'foobar',
-                        'missing_value': np.float32(np.nan),
-                        '_FillValue': np.float32(np.nan)})
-        var = conventions.decode_cf_variable('t', var)
-        self.assertIsNotNone(var)
-
-    @requires_netCDF4
-    def test_decode_cf_datetime_non_iso_strings(self):
-        # datetime strings that are _almost_ ISO compliant but not quite,
-        # but which netCDF4.num2date can still parse correctly
-        expected = pd.date_range(periods=100, start='2000-01-01', freq='h')
-        cases = [(np.arange(100), 'hours since 2000-01-01 0'),
-                 (np.arange(100), 'hours since 2000-1-1 0'),
-                 (np.arange(100), 'hours since 2000-01-01 0:00')]
-        for num_dates, units in cases:
-            actual = conventions.decode_cf_datetime(num_dates, units)
-            self.assertArrayEqual(actual, expected)
-
-    @requires_netCDF4
-    def test_decode_non_standard_calendar(self):
-        import netCDF4 as nc4
-
-        for calendar in ['noleap', '365_day', '360_day', 'julian',
-                         'all_leap', '366_day']:
-            units = 'days since 0001-01-01'
-            times = pd.date_range('2001-04-01-00', end='2001-04-30-23',
-                                  freq='H')
-            noleap_time = nc4.date2num(times.to_pydatetime(), units,
-                                       calendar=calendar)
-            expected = times.values
-            with warnings.catch_warnings():
-                warnings.filterwarnings('ignore',
-                                        'Unable to decode time axis')
-                actual = conventions.decode_cf_datetime(noleap_time, units,
-                                                        calendar=calendar)
-            self.assertEqual(actual.dtype, np.dtype('M8[ns]'))
-            abs_diff = abs(actual - expected)
-            # once we no longer support versions of netCDF4 older than
-            # 1.1.5, we could do this check with near microsecond accuracy:
-            # https://github.com/Unidata/netcdf4-python/issues/355
-            self.assertTrue((abs_diff <= np.timedelta64(1, 's')).all())
-
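test_decode_non_standard_calendar builds its fixtures with the real netCDF4 API; the shape of the round-trip it exercises is roughly (requires the netCDF4 package):

import netCDF4 as nc4
import pandas as pd

units = 'days since 0001-01-01'
times = pd.date_range('2001-04-01', periods=4, freq='H')
nums = nc4.date2num(times.to_pydatetime(), units, calendar='noleap')
# num2date yields netcdftime/datetime objects; xarray converts them to
# datetime64[ns] when every value happens to be representable
dates = nc4.num2date(nums, units, calendar='noleap')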
-    @requires_netCDF4
-    def test_decode_non_standard_calendar_single_element(self):
-        units = 'days since 0001-01-01'
-        for calendar in ['noleap', '365_day', '360_day', 'julian',
-                         'all_leap', '366_day']:
-            for num_time in [735368, [735368], [[735368]]]:
-                with warnings.catch_warnings():
-                    warnings.filterwarnings('ignore',
-                                            'Unable to decode time axis')
-                    actual = conventions.decode_cf_datetime(num_time, units,
-                                                            calendar=calendar)
-                self.assertEqual(actual.dtype, np.dtype('M8[ns]'))
-
-    @requires_netCDF4
-    def test_decode_non_standard_calendar_single_element_fallback(self):
-        import netCDF4 as nc4
-
-        units = 'days since 0001-01-01'
-        dt = nc4.netcdftime.datetime(2001, 2, 29)
-        for calendar in ['360_day', 'all_leap', '366_day']:
-            num_time = nc4.date2num(dt, units, calendar)
-            with self.assertWarns('Unable to decode time axis'):
-                actual = conventions.decode_cf_datetime(num_time, units,
-                                                        calendar=calendar)
-            expected = np.asarray(nc4.num2date(num_time, units, calendar))
-            print(num_time, calendar, actual, expected)
-            self.assertEqual(actual.dtype, np.dtype('O'))
-            self.assertEqual(expected, actual)
-
-    @requires_netCDF4
-    def test_decode_non_standard_calendar_multidim_time(self):
-        import netCDF4 as nc4
-
-        calendar = 'noleap'
-        units = 'days since 0001-01-01'
-        times1 = pd.date_range('2001-04-01', end='2001-04-05', freq='D')
-        times2 = pd.date_range('2001-05-01', end='2001-05-05', freq='D')
-        noleap_time1 = nc4.date2num(times1.to_pydatetime(), units,
-                                    calendar=calendar)
-        noleap_time2 = nc4.date2num(times2.to_pydatetime(), units,
-                                    calendar=calendar)
-        mdim_time = np.empty((len(noleap_time1), 2), )
-        mdim_time[:, 0] = noleap_time1
-        mdim_time[:, 1] = noleap_time2
-
-        expected1 = times1.values
-        expected2 = times2.values
-        with warnings.catch_warnings():
-            warnings.filterwarnings('ignore', 'Unable to decode time axis')
-            actual = conventions.decode_cf_datetime(mdim_time, units,
-                                                    calendar=calendar)
-        self.assertEqual(actual.dtype, np.dtype('M8[ns]'))
-        self.assertArrayEqual(actual[:, 0], expected1)
-        self.assertArrayEqual(actual[:, 1], expected2)
-
-    @requires_netCDF4
-    def test_decode_non_standard_calendar_fallback(self):
-        import netCDF4 as nc4
-        # ensure leap year doesn't matter
-        for year in [2010, 2011, 2012, 2013, 2014]:
-            for calendar in ['360_day', '366_day', 'all_leap']:
-                calendar = '360_day'
-                units = 'days since {0}-01-01'.format(year)
-                num_times = np.arange(100)
-                expected = nc4.num2date(num_times, units, calendar)
-
-                with warnings.catch_warnings(record=True) as w:
-                    warnings.simplefilter('always')
-                    actual = conventions.decode_cf_datetime(num_times, units,
                                                             calendar=calendar)
-                    self.assertEqual(len(w), 1)
-                    self.assertIn('Unable to decode time axis',
-                                  str(w[0].message))
-
-                self.assertEqual(actual.dtype, np.dtype('O'))
-                self.assertArrayEqual(actual, expected)
-
-    @requires_netCDF4
-    def test_cf_datetime_nan(self):
-        for num_dates, units, expected_list in [
-                ([np.nan], 'days since 2000-01-01', ['NaT']),
-                ([np.nan, 0], 'days since 2000-01-01',
-                 ['NaT', '2000-01-01T00:00:00Z']),
-                ([np.nan, 0, 1], 'days since 2000-01-01',
-                 ['NaT', '2000-01-01T00:00:00Z', '2000-01-02T00:00:00Z']),
-        ]:
-            with warnings.catch_warnings():
-                warnings.filterwarnings('ignore', 'All-NaN')
-                actual = conventions.decode_cf_datetime(num_dates, units)
-            expected = np.array(expected_list, dtype='datetime64[ns]')
-            self.assertArrayEqual(expected, actual)
-
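test_cf_datetime_nan depends on NaN being the conventional missing marker on the numeric side and NaT on the decoded side; numpy makes the correspondence directly expressible:

import numpy as np

np.array(['NaT', '2000-01-01'], dtype='datetime64[ns]')
# -> array(['NaT', '2000-01-01T00:00:00.000000000'], dtype='datetime64[ns]')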
-    @requires_netCDF4
-    def test_decoded_cf_datetime_array_2d(self):
-        # regression test for GH1229
-        array = conventions.DecodedCFDatetimeArray(
-            np.array([[0, 1], [2, 3]]), 'days since 2000-01-01')
-        assert array.dtype == 'datetime64[ns]'
-        expected = pd.date_range('2000-01-01',
-                                 periods=4).values.reshape(2, 2)
-        self.assertArrayEqual(np.asarray(array), expected)
-
-    def test_infer_datetime_units(self):
-        for dates, expected in [(pd.date_range('1900-01-01', periods=5),
-                                 'days since 1900-01-01 00:00:00'),
-                                (pd.date_range('1900-01-01 12:00:00',
-                                               freq='H', periods=2),
-                                 'hours since 1900-01-01 12:00:00'),
-                                (['1900-01-01', '1900-01-02',
-                                  '1900-01-02 00:00:01'],
-                                 'seconds since 1900-01-01 00:00:00'),
-                                (pd.to_datetime(
-                                    ['1900-01-01', '1900-01-02', 'NaT']),
-                                 'days since 1900-01-01 00:00:00'),
-                                (pd.to_datetime(['1900-01-01',
-                                                 '1900-01-02T00:00:00.005']),
-                                 'seconds since 1900-01-01 00:00:00'),
-                                (pd.to_datetime(['NaT', '1900-01-01']),
-                                 'days since 1900-01-01 00:00:00'),
-                                (pd.to_datetime(['NaT']),
-                                 'days since 1970-01-01 00:00:00'),
-                                ]:
-            self.assertEqual(expected,
-                             conventions.infer_datetime_units(dates))
-
-    def test_cf_timedelta(self):
-        examples = [
-            ('1D', 'days', np.int64(1)),
-            (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')),
-            ('1h', 'hours', np.int64(1)),
-            ('1ms', 'milliseconds', np.int64(1)),
-            ('1us', 'microseconds', np.int64(1)),
-            (['NaT', '0s', '1s'], None, [np.nan, 0, 1]),
-            (['30m', '60m'], 'hours', [0.5, 1.0]),
-            (np.timedelta64('NaT', 'ns'), 'days', np.nan),
-            (['NaT', 'NaT'], 'days', [np.nan, np.nan]),
-        ]
-
-        for timedeltas, units, numbers in examples:
-            timedeltas = pd.to_timedelta(timedeltas, box=False)
-            numbers = np.array(numbers)
-
-            expected = numbers
-            actual, _ = conventions.encode_cf_timedelta(timedeltas, units)
-            self.assertArrayEqual(expected, actual)
-            self.assertEqual(expected.dtype, actual.dtype)
-
-            if units is not None:
-                expected = timedeltas
-                actual = conventions.decode_cf_timedelta(numbers, units)
-                self.assertArrayEqual(expected, actual)
-                self.assertEqual(expected.dtype, actual.dtype)
-
-        expected = np.timedelta64('NaT', 'ns')
-        actual = conventions.decode_cf_timedelta(np.array(np.nan), 'days')
-        self.assertArrayEqual(expected, actual)
-
-    def test_cf_timedelta_2d(self):
-        timedeltas = ['1D', '2D', '3D']
-        units = 'days'
-        numbers = np.atleast_2d([1, 2, 3])
-
-        timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False))
-        expected = timedeltas
-
-        actual = conventions.decode_cf_timedelta(numbers, units)
-        self.assertArrayEqual(expected, actual)
-        self.assertEqual(expected.dtype, actual.dtype)
-
-    def test_infer_timedelta_units(self):
-        for deltas, expected in [
-                (pd.to_timedelta(['1 day', '2 days']), 'days'),
-                (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
-                (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
-                (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
-            self.assertEqual(expected,
-                             conventions.infer_timedelta_units(deltas))
-
-    def test_invalid_units_raises_eagerly(self):
-        ds = Dataset({'time': ('time', [0, 1],
-                               {'units': 'foobar since 123'})})
-        with raises_regex(ValueError, 'unable to decode time'):
-            decode_cf(ds)
-
-    @requires_netCDF4
-    def test_dataset_repr_with_netcdf4_datetimes(self):
-        # regression test for #347
-        attrs = {'units': 'days since 0001-01-01', 'calendar': 'noleap'}
-        with warnings.catch_warnings():
-            warnings.filterwarnings('ignore', 'unable to decode time')
-            ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
-            self.assertIn('(time) object', repr(ds))
-
-        attrs = {'units': 'days since 1900-01-01'}
-        ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
-        self.assertIn('(time) datetime64[ns]', repr(ds))
-
-        # this should not throw a warning (GH1111)
-        with warnings.catch_warnings():
-            warnings.filterwarnings('error')
-            conventions.DecodedCFDatetimeArray(np.asarray([722624]),
-                                               "days since 0001-01-01")
-
-
 class TestNativeEndiannessArray(TestCase):
     def test(self):
         x = np.arange(5, dtype='>i8')
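The relocated infer tests encode a simple rule: pick the coarsest unit that represents every timestamp exactly, and default the reference date to 1970-01-01 when only NaT is present. Against this branch, roughly:

import pandas as pd
from xarray.coding import times

times.infer_datetime_units(pd.date_range('1900-01-01', periods=5))
# -> 'days since 1900-01-01 00:00:00'
times.infer_datetime_units(
    pd.to_datetime(['1900-01-01', '1900-01-02 00:00:01']))
# a single odd second demotes the unit:
# -> 'seconds since 1900-01-01 00:00:00'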
@@ -595,6 +155,31 @@ def test(self):
         self.assertArrayEqual(a, expected)
 
 
+def test_decode_cf_with_conflicting_fill_missing_value():
+    var = Variable(['t'], np.arange(10),
+                   {'units': 'foobar',
+                    'missing_value': 0,
+                    '_FillValue': 1})
+    with raises_regex(ValueError, "_FillValue and missing_value"):
+        conventions.decode_cf_variable('t', var)
+
+    expected = Variable(['t'], np.arange(10), {'units': 'foobar'})
+
+    var = Variable(['t'], np.arange(10),
+                   {'units': 'foobar',
+                    'missing_value': np.nan,
+                    '_FillValue': np.nan})
+    actual = conventions.decode_cf_variable('t', var)
+    assert_identical(actual, expected)
+
+    var = Variable(['t'], np.arange(10),
+                   {'units': 'foobar',
+                    'missing_value': np.float32(np.nan),
+                    '_FillValue': np.float32(np.nan)})
+    actual = conventions.decode_cf_variable('t', var)
+    assert_identical(actual, expected)
+
+
 @requires_netCDF4
 class TestEncodeCFVariable(TestCase):
     def test_incompatible_attributes(self):
@@ -675,6 +260,38 @@ def test_decode_cf_with_drop_variables(self):
         self.assertDatasetIdentical(expected, actual)
         self.assertDatasetIdentical(expected, actual2)
 
+    def test_invalid_time_units_raises_eagerly(self):
+        ds = Dataset({'time': ('time', [0, 1],
+                               {'units': 'foobar since 123'})})
+        with raises_regex(ValueError, 'unable to decode time'):
+            decode_cf(ds)
+
+    @requires_netCDF4
+    def test_dataset_repr_with_netcdf4_datetimes(self):
+        # regression test for #347
+        attrs = {'units': 'days since 0001-01-01', 'calendar': 'noleap'}
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', 'unable to decode time')
+            ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
+            self.assertIn('(time) object', repr(ds))
+
+        attrs = {'units': 'days since 1900-01-01'}
+        ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
+        self.assertIn('(time) datetime64[ns]', repr(ds))
+
+    @requires_netCDF4
+    def test_decode_cf_datetime_transition_to_invalid(self):
+        # manually create dataset with not-decoded date
+        from datetime import datetime
+        ds = Dataset(coords={'time': [0, 266 * 365]})
+        units = 'days since 2000-01-01 00:00:00'
+        ds.time.attrs = dict(units=units)
+        ds_decoded = conventions.decode_cf(ds)
+
+        expected = [datetime(2000, 1, 1, 0, 0),
+                    datetime(2265, 10, 28, 0, 0)]
+
+        self.assertArrayEqual(ds_decoded.time.values, expected)
+
 
 class CFEncodedInMemoryStore(WritableCFDataStore, InMemoryDataStore):
     pass
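The added tests fix two user-visible behaviours: conflicting _FillValue/missing_value attributes raise immediately, and bad time units fail at decode time rather than surfacing later from a lazy array's repr. At the user level this looks approximately like:

import numpy as np
from xarray import Dataset, Variable, conventions

var = Variable(['t'], np.arange(3),
               {'_FillValue': 1, 'missing_value': 0})
# conventions.decode_cf_variable('t', var)
#   -> ValueError mentioning _FillValue and missing_value

ds = Dataset({'time': ('time', [0, 1], {'units': 'foobar since 123'})})
# conventions.decode_cf(ds)
#   -> ValueError: unable to decode time units ...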
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index fd4244c4f9f..eb37cbe2b26 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -13,9 +13,9 @@
 
 from xarray import (align, broadcast, Dataset, DataArray,
                     IndexVariable, Variable)
+from xarray.coding.times import CFDatetimeCoder
 from xarray.core.pycompat import iteritems, OrderedDict
 from xarray.core.common import full_like
-from xarray.conventions import maybe_encode_datetime
 from xarray.tests import (
     TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
     assert_identical, assert_equal, assert_allclose, assert_array_equal,
@@ -2891,7 +2891,7 @@ def test_to_and_from_iris(self):
             original_coord = original.coords[orginal_key]
             self.assertEqual(coord.var_name, original_coord.name)
             self.assertArrayEqual(coord.points,
-                                  maybe_encode_datetime(original_coord).values)
+                                  CFDatetimeCoder().encode(original_coord).values)
             self.assertEqual(actual.coord_dims(coord),
                              original.get_axis_num(
                                  original.coords[coord.var_name].dims))
@@ -2963,7 +2963,7 @@ def test_to_and_from_iris_dask(self):
             original_coord = original.coords[orginal_key]
             self.assertEqual(coord.var_name, original_coord.name)
             self.assertArrayEqual(coord.points,
-                                  maybe_encode_datetime(original_coord).values)
+                                  CFDatetimeCoder().encode(original_coord).values)
             self.assertEqual(actual.coord_dims(coord),
                              original.get_axis_num(
                                  original.coords[coord.var_name].dims))
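The iris tests now call the coder object instead of the removed maybe_encode_datetime helper. Judging only from its use in these hunks, the coder exposes an encode method that returns a variable with numeric values and the CF time encoding applied; a hedged sketch:

import pandas as pd
from xarray import Variable
from xarray.coding.times import CFDatetimeCoder

var = Variable(('time',), pd.date_range('2000-01-01', periods=3))
encoded = CFDatetimeCoder().encode(var)
# encoded.values should be numbers, with 'units' (and 'calendar')
# placed in encoded.attrs, per the CF conventions these tests exercise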