Skip to content

Commit

Permalink
Defaults autoclose=False for open_mfdataset
Browse files Browse the repository at this point in the history
This default favors standard xarray performance over automatically
avoiding the OSError that can be raised when open_mfdataset opens
too many files at once.
  • Loading branch information
pwolfram committed Feb 5, 2017
1 parent 095fd6a commit 73b601d
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 29 deletions.
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ v0.9.2 (unreleased)
Enhancements
~~~~~~~~~~~~

- It is now possible to pass ``autoclose=True`` to
:py:func:`~xarray.open_mfdataset` to explicitly close opened files when they
are not in use, which prevents the ``OSError`` raised when too many files are
open. Note that the default is ``autoclose=False``, which is consistent with
previous xarray behavior. By `Phillip J. Wolfram <https://github.com/pwolfram>`_.

Bug fixes
~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _protect_dataset_variables_inplace(dataset, cache):


def open_dataset(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=True, decode_times=True, autoclose=True,
mask_and_scale=True, decode_times=True, autoclose=False,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, cache=None, drop_variables=None):
"""Load and decode a dataset from a file or file-like object.
Expand Down
64 changes: 36 additions & 28 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,17 +1077,19 @@ def test_open_mfdataset(self):
with create_tmp_file() as tmp2:
original.isel(x=slice(5)).to_netcdf(tmp1)
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2]) as actual:
self.assertIsInstance(actual.foo.variable.data, da.Array)
self.assertEqual(actual.foo.variable.data.chunks,
((5, 5),))
self.assertDatasetAllClose(original, actual)
with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual:
self.assertEqual(actual.foo.variable.data.chunks,
((3, 2, 3, 2),))
for close in [True, False]:
with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
self.assertIsInstance(actual.foo.variable.data, da.Array)
self.assertEqual(actual.foo.variable.data.chunks,
((5, 5),))
self.assertDatasetAllClose(original, actual)
with open_mfdataset([tmp1, tmp2], chunks={'x': 3}, autoclose=close) as actual:
self.assertEqual(actual.foo.variable.data.chunks,
((3, 2, 3, 2),))

with self.assertRaisesRegexp(IOError, 'no files to open'):
open_mfdataset('foo-bar-baz-*.nc')
for close in [True, False]:
open_mfdataset('foo-bar-baz-*.nc', autoclose=close)

def test_preprocess_mfdataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
Expand All @@ -1098,8 +1100,9 @@ def preprocess(ds):
return ds.assign_coords(z=0)

expected = preprocess(original)
with open_mfdataset(tmp, preprocess=preprocess) as actual:
self.assertDatasetIdentical(expected, actual)
for close in [True, False]:
with open_mfdataset(tmp, preprocess=preprocess, autoclose=close) as actual:
self.assertDatasetIdentical(expected, actual)

def test_save_mfdataset_roundtrip(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
Expand All @@ -1108,8 +1111,9 @@ def test_save_mfdataset_roundtrip(self):
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
save_mfdataset(datasets, [tmp1, tmp2])
with open_mfdataset([tmp1, tmp2]) as actual:
self.assertDatasetIdentical(actual, original)
for close in [True, False]:
with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
self.assertDatasetIdentical(actual, original)

def test_save_mfdataset_invalid(self):
ds = Dataset()
Expand All @@ -1122,18 +1126,21 @@ def test_open_and_do_math(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp:
original.to_netcdf(tmp)
with open_mfdataset(tmp) as ds:
actual = 1.0 * ds
self.assertDatasetAllClose(original, actual)
for close in [True, False]:
with open_mfdataset(tmp, autoclose=close) as ds:
actual = 1.0 * ds
self.assertDatasetAllClose(original, actual)

def test_open_mfdataset_concat_dim_none(self):
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
data = Dataset({'x': 0})
data.to_netcdf(tmp1)
Dataset({'x': np.nan}).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
self.assertDatasetIdentical(data, actual)
for close in [True, False]:
with open_mfdataset([tmp1, tmp2],
concat_dim=None, autoclose=close) as actual:
self.assertDatasetIdentical(data, actual)

def test_open_dataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
Expand Down Expand Up @@ -1165,16 +1172,17 @@ def test_deterministic_names(self):
with create_tmp_file() as tmp:
data = create_test_data()
data.to_netcdf(tmp)
with open_mfdataset(tmp) as ds:
original_names = dict((k, v.data.name)
for k, v in ds.data_vars.items())
with open_mfdataset(tmp) as ds:
repeat_names = dict((k, v.data.name)
for k, v in ds.data_vars.items())
for var_name, dask_name in original_names.items():
self.assertIn(var_name, dask_name)
self.assertIn(tmp, dask_name)
self.assertEqual(original_names, repeat_names)
for close in [True, False]:
with open_mfdataset(tmp, autoclose=close) as ds:
original_names = dict((k, v.data.name)
for k, v in ds.data_vars.items())
with open_mfdataset(tmp, autoclose=close) as ds:
repeat_names = dict((k, v.data.name)
for k, v in ds.data_vars.items())
for var_name, dask_name in original_names.items():
self.assertIn(var_name, dask_name)
self.assertIn(tmp, dask_name)
self.assertEqual(original_names, repeat_names)

def test_dataarray_compute(self):
# Test DataArray.compute() on dask backend.
Expand Down

0 comments on commit 73b601d

Please sign in to comment.