Skip to content

Commit

Permalink
Allow for pd.TimedeltaIndex and serialize it to netCDF
Browse files Browse the repository at this point in the history
Fixes GH55
  • Loading branch information
shoyer committed Dec 12, 2014
1 parent a31e0e5 commit 4f3fa30
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ matrix:
fast_finish: true
include:
- python: 2.6
env: UPDATE_ENV="conda install unittest2 pandas==0.13.1"
env: UPDATE_ENV="conda install unittest2 pandas==0.15.0"
# Test on Python 2.7 with and without netCDF4/scipy
- python: 2.7
env: UPDATE_ENV="pip install cyordereddict"
Expand Down
118 changes: 93 additions & 25 deletions xray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,23 +147,48 @@ def nan_safe_num2date(num):
return dates


def decode_cf_timedelta(num_timedeltas, units):
    """Given an array of numeric timedeltas in netCDF format, convert it into a
    numpy timedelta64[ns] array.

    Parameters
    ----------
    num_timedeltas : array-like
        Numeric values counting whole `units`.
    units : str
        CF time unit, e.g. 'days', 'hours', 'minutes' or 'seconds'.
    """
    num_timedeltas = np.asarray(num_timedeltas)
    # rename 'seconds', 'minutes' and 'hours' to formats pandas recognizes
    units = {'seconds': 's', 'minutes': 'm', 'hours': 'h'}.get(units, units)
    # use .values to unwrap the TimedeltaIndex into a plain ndarray instead of
    # the `box=False` keyword, which was removed from pandas
    decoded = pd.to_timedelta(num_timedeltas.ravel(), unit=units).values
    return decoded.reshape(num_timedeltas.shape)


TIME_UNITS = set(['days', 'hours', 'minutes', 'seconds'])

def _infer_time_units_from_diff(unique_timedeltas):
for time_unit, delta in [('days', 86400), ('hours', 3600),
('minutes', 60), ('seconds', 1)]:
unit_delta = np.timedelta64(10 ** 9 * delta, 'ns')
diffs = unique_timedeltas / unit_delta
if np.all(diffs == diffs.astype(int)):
break
else:
raise ValueError('could not automatically determine time units')
return '%s since %s' % (time_unit, dates[0])
return time_unit
raise ValueError('could not automatically determine time units')


def infer_datetime_units(dates):
    """Given an array of datetimes, returns a CF compatible time-unit string of
    the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
    'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
    unique time deltas in `dates`)
    """
    # np.asarray(...) instead of pd.to_datetime(..., box=False): the `box`
    # keyword was removed from pandas, and asarray yields the same
    # datetime64[ns] ndarray
    dates = np.asarray(pd.to_datetime(dates))
    # infer the unit from the gaps between non-null dates
    unique_timedeltas = np.unique(np.diff(dates[pd.notnull(dates)]))
    units = _infer_time_units_from_diff(unique_timedeltas)
    return '%s since %s' % (units, pd.Timestamp(dates[0]))


def infer_timedelta_units(deltas):
    """Given an array of timedeltas, returns a CF compatible time-unit from
    {'days', 'hours', 'minutes', 'seconds'} (the first one that can evenly
    divide all unique time deltas in `deltas`)
    """
    # np.asarray(...) instead of pd.to_timedelta(..., box=False): the `box`
    # keyword was removed from pandas
    deltas = np.asarray(pd.to_timedelta(deltas))
    unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
    return _infer_time_units_from_diff(unique_timedeltas)


def nctime_to_nptime(times):
Expand Down Expand Up @@ -193,7 +218,7 @@ def encode_cf_datetime(dates, units=None, calendar=None):
dates = np.asarray(dates)

if units is None:
units = guess_time_units(dates)
units = infer_datetime_units(dates)
if calendar is None:
calendar = 'proleptic_gregorian'

Expand All @@ -211,6 +236,21 @@ def encode_datetime(d):
return (num, units, calendar)


def encode_cf_timedelta(timedeltas, units=None):
    """Encode a timedelta64 array as numeric counts of a CF time unit.

    Returns a (num, units) pair, where `num` counts whole `units`. When any
    input values are missing (NaT), `num` is upcast to float and missing
    entries become NaN. If `units` is not given, it is inferred from the data.
    """
    if units is None:
        units = infer_timedelta_units(timedeltas)

    numpy_units = {'seconds': 's', 'minutes': 'm', 'hours': 'h', 'days': 'D'}
    counts = timedeltas.astype('timedelta64[%s]' % numpy_units[units]).view(np.int64)

    has_missing = pd.isnull(timedeltas)
    if np.any(has_missing):
        # integer dtype cannot represent NaN, so upcast before masking
        counts = counts.astype(float)
        counts[has_missing] = np.nan

    return (counts, units)


class MaskedAndScaledArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically scaled and masked according to
Expand Down Expand Up @@ -288,6 +328,23 @@ def __getitem__(self, key):
calendar=self.calendar)


class DecodedCFTimedeltaArray(utils.NDArrayMixin):
    """Lazy wrapper around array-like objects holding netCDF numeric
    timedeltas.

    Values are only converted (via decode_cf_timedelta) when accessed by
    indexing, yielding timedelta64[ns] data.
    """
    def __init__(self, array, units):
        self.array = array
        self.units = units

    @property
    def dtype(self):
        # 'm8[ns]' is numpy shorthand for timedelta64[ns]
        return np.dtype('m8[ns]')

    def __getitem__(self, key):
        values = self.array[key]
        return decode_cf_timedelta(values, units=self.units)


class CharToStringArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically concatenated along the last
Expand Down Expand Up @@ -358,7 +415,7 @@ def char_to_string(arr):
return arr.view(kind + str(arr.shape[-1]))[..., 0]


def safe_setitem(dest, key, value):
    """Assign dest[key] = value, refusing to silently overwrite an existing
    key (raises ValueError instead)."""
    if key not in dest:
        dest[key] = value
    else:
        raise ValueError('Failed hard to prevent overwriting key %r' % key)
Expand All @@ -370,9 +427,9 @@ def pop_to(source, dest, key, default=None):
None values are not passed on. If k already exists in dest an
error is raised.
"""
value = source.pop(key, default)
value = source.pop(key, None)
if value is not None:
_safe_setitem(dest, key, value)
safe_setitem(dest, key, value)
return value


def maybe_encode_datetime(var):
    """If `var` holds datetime64 data (or an object array of datetime
    objects), encode it to CF numeric form, recording 'units' and 'calendar'
    in attrs. Other variables pass through unchanged.
    """
    if (np.issubdtype(var.dtype, np.datetime64)
            or (var.dtype.kind == 'O'
                and isinstance(var.values.flat[0], datetime))):
        dims, values, attrs, encoding = _var_as_tuple(var)
        # units/calendar may be pre-specified in encoding; otherwise inferred
        (values, units, calendar) = encode_cf_datetime(
            values, encoding.pop('units', None), encoding.pop('calendar', None))
        # safe_setitem raises if attrs already carries these keys, so we
        # never silently clobber user-provided metadata
        safe_setitem(attrs, 'units', units)
        safe_setitem(attrs, 'calendar', calendar)
        var = Variable(dims, values, attrs, encoding)
    return var


def maybe_encode_timedelta(var):
    # Encode timedelta64 data as numeric values with a CF 'units' attribute;
    # non-timedelta variables pass through unchanged.
    if np.issubdtype(var.dtype, np.timedelta64):
        dims, values, attrs, encoding = _var_as_tuple(var)
        # units may be pre-specified in encoding; otherwise inferred from data
        values, units = encode_cf_timedelta(
            values, encoding.pop('units', None))
        # raises if attrs already has 'units' (prevents silent clobbering)
        safe_setitem(attrs, 'units', units)
        var = Variable(dims, values, attrs, encoding)
    return var

Expand Down Expand Up @@ -452,7 +514,7 @@ def _infer_dtype(array):
else:
dtype = np.array(array.flat[0]).dtype
if dtype.kind in ['S', 'U']:
# don't just use inferred_dtype to avoid truncating arrays to
# don't just use inferred dtype to avoid truncating arrays to
# the length of their first element
dtype = np.dtype(dtype.kind)
elif dtype.kind == 'O':
Expand Down Expand Up @@ -511,6 +573,7 @@ def encode_cf_variable(var, needs_copy=True):
A variable which has been encoded as described above.
"""
var = maybe_encode_datetime(var)
var = maybe_encode_timedelta(var)
var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
var, needs_copy = maybe_encode_fill_value(var, needs_copy)
var = maybe_encode_dtype(var, needs_copy)
Expand Down Expand Up @@ -585,11 +648,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
data = MaskedAndScaledArray(data, fill_value, scale_factor,
add_offset, dtype)

if decode_times:
if 'units' in attributes and 'since' in attributes['units']:
if decode_times and 'units' in attributes:
if 'since' in attributes['units']:
# datetime
units = pop_to(attributes, encoding, 'units')
calendar = pop_to(attributes, encoding, 'calendar')
data = DecodedCFDatetimeArray(data, units, calendar)
elif attributes['units'] in TIME_UNITS:
# timedelta
units = pop_to(attributes, encoding, 'units')
data = DecodedCFTimedeltaArray(data, units)

return Variable(dimensions, indexing.LazilyIndexedArray(data),
attributes, encoding=encoding)
Expand Down
5 changes: 2 additions & 3 deletions xray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,8 @@ def safe_cast_to_index(array):
index = array.to_index()
else:
kwargs = {}
if hasattr(array, 'dtype'):
if array.dtype == object or array.dtype.kind == 'm':
kwargs['dtype'] = object
if hasattr(array, 'dtype') and array.dtype.kind == 'O':
kwargs['dtype'] = object
index = pd.Index(np.asarray(array), **kwargs)
return index

Expand Down
13 changes: 11 additions & 2 deletions xray/core/variable.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import timedelta
import functools

import numpy as np
Expand Down Expand Up @@ -72,13 +73,15 @@ def _as_compatible_data(data):

if isinstance(data, pd.Timestamp):
# TODO: convert, handle datetime objects, too
data = np.datetime64(data, 'ns')
data = np.datetime64(data.value, 'ns')
if isinstance(data, timedelta):
data = np.timedelta64(getattr(data, 'value', data), 'ns')

# don't check for __len__ or __iter__ so as not to cast if data is a numpy
# numeric type like np.float32
required = ['dtype', 'shape', 'size', 'ndim']
if (any(not hasattr(data, attr) for attr in required)
or isinstance(data, (np.string_, np.datetime64))):
or isinstance(data, (np.string_, np.datetime64, np.timedelta64))):
# data must be ndarray-like
data = np.asarray(data)

Expand All @@ -103,6 +106,8 @@ def _as_compatible_data(data):
if data.dtype.kind == 'M':
# TODO: automatically cast arrays of datetime objects as well
data = np.asarray(data, 'datetime64[ns]')
if data.dtype.kind == 'm':
data = np.asarray(data, 'timedelta64[ns]')
data = NumpyArrayAdapter(data)

return data
Expand Down Expand Up @@ -170,6 +175,8 @@ def __getitem__(self, key):
# pd.Timestamp rather than np.datetime64 but this is easier
# (for now)
value = np.datetime64('NaT', 'ns')
elif isinstance(value, timedelta):
value = np.timedelta64(getattr(value, 'value', value), 'ns')
else:
value = np.asarray(value, dtype=self.dtype)
else:
Expand Down Expand Up @@ -205,6 +212,8 @@ def _as_array_or_item(data):
# convert to a np.datetime64 object, because 0-dimensional ndarrays
# with dtype=datetime64 are broken :(
data = np.datetime64(data, 'ns')
elif data.dtype.kind == 'm':
data = np.timedelta64(data, 'ns')
return data


Expand Down
12 changes: 7 additions & 5 deletions xray/test/test_backends.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from xray.conventions import cf_decoder
try:
import cPickle as pickle
except ImportError:
import pickle
from io import BytesIO
import contextlib
import os.path
import pickle
import tempfile
import unittest
import sys
Expand Down Expand Up @@ -154,6 +150,12 @@ def test_roundtrip_datetime_data(self):
with self.roundtrip(expected) as actual:
self.assertDatasetIdentical(expected, actual)

    def test_roundtrip_timedelta_data(self):
        # Round-trip timedelta data (including NaT) through the backend and
        # check nothing is lost or altered.
        time_deltas = pd.to_timedelta(['1h', '2h', 'NaT'])
        expected = Dataset({'td': ('td', time_deltas)})
        with self.roundtrip(expected) as actual:
            self.assertDatasetIdentical(expected, actual)

def test_roundtrip_example_1_netcdf(self):
expected = open_example_dataset('example_1.nc')
with self.roundtrip(expected) as actual:
Expand Down
14 changes: 12 additions & 2 deletions xray/test/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def test_cf_datetime_nan(self):
expected = np.array(expected_list, dtype='datetime64[ns]')
self.assertArrayEqual(expected, actual)

def test_guess_time_units(self):
def test_infer_datetime_units(self):
for dates, expected in [(pd.date_range('1900-01-01', periods=5),
'days since 1900-01-01 00:00:00'),
(pd.date_range('1900-01-01 12:00:00', freq='H',
Expand All @@ -275,14 +275,24 @@ def test_guess_time_units(self):
'seconds since 1900-01-01 00:00:00'),
(pd.to_datetime(['1900-01-01', '1900-01-02', 'NaT']),
'days since 1900-01-01 00:00:00')]:
self.assertEqual(expected, conventions.guess_time_units(dates))
self.assertEqual(expected, conventions.infer_datetime_units(dates))

    def test_infer_timedelta_units(self):
        # Expected result is the coarsest unit that evenly divides every
        # delta; NaN entries are ignored.
        for deltas, expected in [
                (pd.to_timedelta(['1 day', '2 days']), 'days'),
                (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
                (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
                (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
            self.assertEqual(expected, conventions.infer_timedelta_units(deltas))


@requires_netCDF4
class TestEncodeCFVariable(TestCase):
def test_incompatible_attributes(self):
invalid_vars = [
Variable(['t'], pd.date_range('2000-01-01', periods=3),
{'units': 'foobar'}),
Variable(['t'], pd.to_timedelta(['1 day']), {'units': 'foobar'}),
Variable(['t'], [0, 1, 2], {'add_offset': 0}, {'add_offset': 2}),
Variable(['t'], [0, 1, 2], {'_FillValue': 0}, {'_FillValue': 2}),
]
Expand Down
9 changes: 9 additions & 0 deletions xray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,15 @@ def test_sel(self):
self.assertDatasetEqual(data.isel(time=slice(3)),
data.sel(time=(data['time.dayofyear'] <= 3)))

td = pd.to_timedelta(np.arange(3), unit='days')
data = Dataset({'x': ('td', np.arange(3)), 'td': td})
self.assertDatasetEqual(data, data.sel(td=td))
self.assertDatasetEqual(data, data.sel(td=slice('3 days')))
self.assertDatasetEqual(data.isel(td=0), data.sel(td='0 days'))
self.assertDatasetEqual(data.isel(td=0), data.sel(td='0h'))
self.assertDatasetEqual(data.isel(td=slice(1, 3)),
data.sel(td=slice('1 days', '2 days')))

def test_loc(self):
data = create_test_data()
expected = data.sel(dim3='a')
Expand Down
5 changes: 3 additions & 2 deletions xray/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ class TestSafeCastToIndex(TestCase):
def test(self):
dates = pd.date_range('2000-01-01', periods=10)
x = np.arange(5)
timedeltas = x * np.timedelta64(1, 'D')
td = x * np.timedelta64(1, 'D')
for expected, array in [
(dates, dates.values),
(pd.Index(x, dtype=object), x.astype(object)),
(pd.Index(timedeltas, dtype=object), timedeltas),
(pd.Index(td), td),
(pd.Index(td, dtype=object), td.astype(object)),
]:
actual = utils.safe_cast_to_index(array)
self.assertArrayEqual(expected, actual)
Expand Down
Loading

0 comments on commit 4f3fa30

Please sign in to comment.