Disable automatic cache with dask #1024
@@ -255,8 +255,11 @@ def load_store(cls, store, decoder=None):
         return obj

     def __getstate__(self):
-        """Always load data in-memory before pickling"""
-        self.load()
+        """Load data in-memory before pickling (except for Dask data)"""
+        for v in self.variables.values():
+            if not isinstance(v.data, dask_array_type):
+                v.load()
+
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
         state = self.__dict__.copy()
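To illustrate the intended effect of this change, here is a rough sketch (the variable name and chunk sizes are hypothetical, not from the PR): pickling a dask-backed Dataset should no longer pull the lazy data into memory.

import pickle
import dask.array as da
import xarray as xr

# A Dataset whose variable is a dask array.
ds = xr.Dataset({'temp': (('time',), da.zeros(100, chunks=25))})

# With the new __getstate__, dask-backed variables are left alone,
# so pickling does not trigger a compute of the lazy data.
payload = pickle.dumps(ds)
restored = pickle.loads(payload)
print(restored['temp'].chunks)  # still chunked: ((25, 25, 25, 25),)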
@@ -319,6 +322,19 @@ def load(self):

         return self

+    def compute(self):
+        """Manually trigger loading of this dataset's data from disk or a
+        remote source into memory and return a new dataset. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
                           file_obj=None):
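A brief usage note on the difference from load(), reusing the hypothetical dask-backed ds from the sketch above: compute() returns a loaded copy and leaves the original lazy, whereas load() loads in place.

loaded = ds.compute()   # 'loaded' holds numpy data; 'ds' still holds dask arrays
ds.load()               # now 'ds' itself is loaded in place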
@@ -401,14 +417,12 @@ def copy(self, deep=False):
         """Returns a copy of this dataset.

         If `deep=True`, a deep copy is made of each of the component variables.
-        Otherwise, a shallow copy is made, so each variable in the new dataset
-        is also a variable in the original dataset.
+        Otherwise, a shallow copy of each of the component variables is made, so
+        that the underlying memory region of the new dataset is the same as in
+        the original dataset.
         """
-        if deep:
-            variables = OrderedDict((k, v.copy(deep=True))
-                                    for k, v in iteritems(self._variables))
-        else:
-            variables = self._variables.copy()
+        variables = OrderedDict((k, v.copy(deep=deep))
+                                for k, v in iteritems(self._variables))
         # skip __init__ to avoid costly validation
         return self._construct_direct(variables, self._coord_names.copy(),
                                       self._dims.copy(), self._attrs_copy())
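A minimal sketch of the shallow-copy semantics described in the new docstring (variable names are illustrative): the copy gets new Variable objects, but they share the original's memory.

import numpy as np
import xarray as xr

ds = xr.Dataset({'a': ('x', np.arange(3))})
shallow = ds.copy(deep=False)

# The underlying buffer is shared, so mutating the copy is visible
# through the original dataset.
shallow['a'].values[0] = 99
print(ds['a'].values)  # [99  1  2]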
@@ -792,13 +806,19 @@ def chunks(self):
         array.
         """
         chunks = {}
-        for v in self.variables.values():
+        for v in self.data_vars.values():
             if v.chunks is not None:
                 new_chunks = list(zip(v.dims, v.chunks))
                 if any(chunk != chunks[d] for d, chunk in new_chunks
                        if d in chunks):
                     raise ValueError('inconsistent chunks')
                 chunks.update(new_chunks)
+        if chunks:
+            # Add dims that are defined in the coords but are not in data_vars
+            for v in self.coords.values():
+                for dim in v.dims:
+                    if dim not in chunks:
+                        chunks[dim] = (v.size,)
         return Frozen(SortedKeysDict(chunks))

     def chunk(self, chunks=None, name_prefix='xarray-', token=None,

[Review comment, on the `if chunks:` line] Why should this need […]? I might simply make this: […]

[Review comment] if none of the data_vars use the dask backend, then you want chunks to return None.

[Review comment] I guess this method is inconsistent with […]. I would either skip this change or use something like my version.
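A rough sketch of what the revised property returns (dimension and variable names are made up): chunk tuples come from the data variables, and dimensions that only appear on coordinates are filled in from the coordinate sizes.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'temp': (('time', 'x'), np.zeros((8, 4)))},
    coords={'time': np.arange(8), 'x': np.arange(4)},
).chunk({'time': 4})

# Mapping from dimension name to the chunk sizes along that dimension.
print(dict(ds.chunks))  # {'time': (4, 4), 'x': (4,)}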
@@ -851,6 +871,9 @@ def selkeys(dict_, keys):
             return dict((d, dict_[d]) for d in keys if d in dict_)

         def maybe_chunk(name, var, chunks):
+            if name not in self.data_vars:
+                return var
+
             chunks = selkeys(chunks, var.dims)
             if not chunks:
                 chunks = None

[Review comment, on the `if name not in self.data_vars:` line] I see your point about performance, but I think that mostly holds true for indexes. So I would be inclined to adjust this to only skip variables in […]. I am still concerned about skipping coords if they are already dask arrays. If they are already dask arrays, then […]
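For context on the discussion above, a hypothetical example of what skipping non-data_vars in maybe_chunk would imply (names are illustrative): after chunk(), data variables become dask arrays while coordinates stay in memory.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'temp': (('time',), np.zeros(100))},
    coords={'lat': ('time', np.linspace(-90, 90, 100))},
)
chunked = ds.chunk({'time': 25})

print(chunked['temp'].chunks)  # ((25, 25, 25, 25),) - now a dask array
print(chunked['lat'].chunks)   # None - the coordinate was not rechunked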
Changes in a second file:
@@ -274,10 +274,21 @@ def data(self, data):
                 "replacement data must match the Variable's shape")
         self._data = data

+    def _data_cast(self):
+        if isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
+            return self._data
+        else:
+            return np.asarray(self._data)
+
     def _data_cached(self):
-        if not isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
-            self._data = np.asarray(self._data)
-        return self._data
+        """Load data into memory and return it.
+        Do not cache dask arrays automatically; that should
+        require an explicit load() call.
+        """
+        new_data = self._data_cast()
+        if not isinstance(self._data, dask_array_type):
+            self._data = new_data
+        return new_data

     @property
     def _indexable_data(self):

[Review comment, on the `isinstance(self._data, (np.ndarray, PandasIndexAdapter))` check] Should this branch not also apply to […]

[Review comment] In fact, if you manually create a […]
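The cast-versus-cache split can be summarized with a small standalone sketch (not the actual xarray classes, just the pattern being introduced): casting converts to an in-memory array without touching self, while caching stores the result unless the underlying data is a dask array.

import numpy as np
import dask.array as da

class LazyHolder:
    """Toy illustration of the _data_cast / _data_cached split."""

    def __init__(self, data):
        self._data = data

    def _data_cast(self):
        # Convert to an in-memory array; never mutates self._data.
        if isinstance(self._data, np.ndarray):
            return self._data
        return np.asarray(self._data)

    def _data_cached(self):
        # Materialize the data, but only cache it when the stored data
        # is not a dask array; dask stays lazy until an explicit load().
        new_data = self._data_cast()
        if not isinstance(self._data, da.Array):
            self._data = new_data
        return new_data

holder = LazyHolder(da.zeros(10, chunks=5))
_ = holder._data_cached()
print(type(holder._data))  # still a dask Array; nothing was cached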
@@ -291,12 +302,26 @@ def load(self):
         because all xarray functions should either work on deferred data or
         load data automatically.
         """
-        self._data_cached()
+        self._data = self._data_cast()
         return self

+    def compute(self):
+        """Manually trigger loading of this variable's data from disk or a
+        remote source into memory and return a new variable. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     def __getstate__(self):
-        """Always cache data as an in-memory array before pickling"""
-        self._data_cached()
+        """Always cache data as an in-memory array before pickling
+        (with the exception of dask backend)"""
+        if not isinstance(self._data, dask_array_type):
+            self._data_cached()
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
         return self.__dict__
@@ -1102,10 +1127,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError('%s objects must be 1-dimensional' %
                              type(self).__name__)

-    def _data_cached(self):
-        if not isinstance(self._data, PandasIndexAdapter):
-            self._data = PandasIndexAdapter(self._data)
-        return self._data
+    def _data_cast(self):
+        if isinstance(self._data, PandasIndexAdapter):
+            return self._data
+        else:
+            return PandasIndexAdapter(self._data)

     def __getitem__(self, key):
         key = self._item_key_to_tuple(key)
[Review comment] I am concerned about skipping non-data_vars here. Coordinates could still be chunked, e.g., if they were loaded from a file, or created directly from dask arrays.
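To make that concern concrete, a hypothetical case where a coordinate is itself a dask array (names are illustrative): it would be invisible to logic that only inspects data_vars.

import dask.array as da
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'temp': (('time',), np.zeros(10))},
    coords={'lat': ('time', da.arange(10, chunks=5))},
)

# The non-dimension coordinate 'lat' is chunked even though it is not a
# data variable, so data_vars-only logic (chunks, maybe_chunk) skips it.
print(ds['lat'].chunks)  # ((5, 5),)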