-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove caching logic from xarray.Variable #1128
Changes from 10 commits
81f9b94
49135f2
7f70d15
8d19a16
5f5ca9e
c85dce7
95e6737
0379bfe
9a5364b
85f29cf
6fa043d
f804e2e
e3b9cd6
b102e20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ dependencies: | |
- python=2.7 | ||
- cdat-lite | ||
- dask | ||
- distributed | ||
- pytest | ||
- numpy | ||
- pandas>=0.15.0 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ dependencies: | |
- python=2.7 | ||
- cython | ||
- dask | ||
- distributed | ||
- h5py | ||
- pytest | ||
- numpy | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ name: test_env | |
dependencies: | ||
- python=2.7 | ||
- dask | ||
- distributed | ||
- h5py | ||
- netcdf4 | ||
- pytest | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ dependencies: | |
- python=3.5 | ||
- cython | ||
- dask | ||
- distributed | ||
- h5py | ||
- matplotlib | ||
- netcdf4 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
from ..core.utils import FrozenOrderedDict, close_on_error, Frozen | ||
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict | ||
|
||
from .common import WritableCFDataStore | ||
from .common import WritableCFDataStore, DataStorePickleMixin | ||
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding, | ||
BaseNetCDF4Array) | ||
|
||
|
@@ -37,24 +37,32 @@ def _read_attributes(h5netcdf_var): | |
lsd_okay=False, backend='h5netcdf') | ||
|
||
|
||
class H5NetCDFStore(WritableCFDataStore): | ||
def _open_h5netcdf_group(filename, mode, group): | ||
import h5netcdf.legacyapi | ||
ds = h5netcdf.legacyapi.Dataset(filename, mode=mode) | ||
with close_on_error(ds): | ||
return _nc4_group(ds, group, mode) | ||
|
||
|
||
class H5NetCDFStore(WritableCFDataStore, DataStorePickleMixin): | ||
"""Store for reading and writing data via h5netcdf | ||
""" | ||
def __init__(self, filename, mode='r', format=None, group=None, | ||
writer=None): | ||
import h5netcdf.legacyapi | ||
if format not in [None, 'NETCDF4']: | ||
raise ValueError('invalid format for h5netcdf backend') | ||
ds = h5netcdf.legacyapi.Dataset(filename, mode=mode) | ||
with close_on_error(ds): | ||
self.ds = _nc4_group(ds, group, mode) | ||
opener = functools.partial(_open_h5netcdf_group, filename, mode=mode, | ||
group=group) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Might be worth noting that this is only cloud-picklable, not stdlib-picklable.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Can you clarify why this won't work with stdlib pickle? Is the issue doing the [inline code missing from this capture]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Ah, you're right. I'm surprised:
In [1]: from operator import add
In [2]: from functools import partial
In [3]: from pickle import dumps, loads
In [4]: loads(dumps(partial(add, 1)))
Out[4]: functools.partial(<built-in function add>, 1)
In [5]: loads(dumps(partial(add, 1)))(2)
Out[5]: 3 |
||
self.ds = opener() | ||
self.format = format | ||
self._opener = opener | ||
self._filename = filename | ||
self._mode = mode | ||
super(H5NetCDFStore, self).__init__(writer) | ||
|
||
def open_store_variable(self, var): | ||
def open_store_variable(self, name, var): | ||
dimensions = var.dimensions | ||
data = indexing.LazilyIndexedArray(BaseNetCDF4Array(var)) | ||
data = indexing.LazilyIndexedArray(BaseNetCDF4Array(name, self)) | ||
attrs = _read_attributes(var) | ||
|
||
# netCDF4 specific encoding | ||
|
@@ -69,7 +77,7 @@ def open_store_variable(self, var): | |
return Variable(dimensions, data, attrs, encoding) | ||
|
||
def get_variables(self): | ||
return FrozenOrderedDict((k, self.open_store_variable(v)) | ||
return FrozenOrderedDict((k, self.open_store_variable(k, v)) | ||
for k, v in iteritems(self.ds.variables)) | ||
|
||
def get_attrs(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ran a test where I create a `Dataset` from a custom data store which initializes `Variable`s using dask arrays for `data`. In this case the dask array is still converted to an ndarray when accessing the `Variable`'s `data` property: the property checks for a dask array type, but here the array is wrapped in a `CopyOnWriteArray`, which means `Variable.values` is called, which loads eagerly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, you would need to use `cache=False` in such a case.

Xarray's decoding logic in `conventions.py` uses its own array objects instead of dask arrays (for reasons I could get into), which unfortunately makes using dask.array objects to produce variables in a custom data store non-ideal. The problem is that the graphs from such dask arrays don't get linked up into xarray, which means that even if you rechunk the arrays in the xarray Dataset, they still get executed separately by dask. Duck typing for dask objects would probably help here (dask/dask#1068).