From 1460a070b4cecb103485955e617f8eb4a7d64c0c Mon Sep 17 00:00:00 2001
From: "Phillip J. Wolfram"
Date: Tue, 31 Jan 2017 12:39:00 -0700
Subject: [PATCH] Defaults autoclose=False for open_mfdataset

This choice of default is to select standard xarray performance over
general removal of the OSError associated with opening too many files
as encountered using open_mfdataset
---
 doc/whats-new.rst             |  6 ++++
 xarray/backends/api.py        |  2 +-
 xarray/tests/test_backends.py | 66 ++++++++++++++++++++---------------
 3 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 4ddaf01286c..e59ff1d36fa 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -22,6 +22,12 @@ v0.9.2 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
+- It is now possible to set the ``autoclose=True`` argument to
+  :py:func:`~xarray.open_mfdataset` to explicitly close opened files when not
+  in use to prevent occurrence of an OS Error related to too many open files.
+  Note, the default is ``autoclose=False``, which is consistent with previous
+  xarray behavior. By `Phillip J. Wolfram `_.
+
 Bug fixes
 ~~~~~~~~~
 
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index d7b3b8d2100..dc741ea73d8 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -132,7 +132,7 @@ def _protect_dataset_variables_inplace(dataset, cache):
 
 
 def open_dataset(filename_or_obj, group=None, decode_cf=True,
-                 mask_and_scale=True, decode_times=True, autoclose=True,
+                 mask_and_scale=True, decode_times=True, autoclose=False,
                  concat_characters=True, decode_coords=True, engine=None,
                  chunks=None, lock=None, cache=None, drop_variables=None):
     """Load and decode a dataset from a file or file-like object.
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index c238b6076db..adef78b93f7 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1037,17 +1037,19 @@ def test_open_mfdataset(self):
             with create_tmp_file() as tmp2:
                 original.isel(x=slice(5)).to_netcdf(tmp1)
                 original.isel(x=slice(5, 10)).to_netcdf(tmp2)
-                with open_mfdataset([tmp1, tmp2]) as actual:
-                    self.assertIsInstance(actual.foo.variable.data, da.Array)
-                    self.assertEqual(actual.foo.variable.data.chunks,
-                                     ((5, 5),))
-                    self.assertDatasetAllClose(original, actual)
-                with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual:
-                    self.assertEqual(actual.foo.variable.data.chunks,
-                                     ((3, 2, 3, 2),))
+                for close in [True, False]:
+                    with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
+                        self.assertIsInstance(actual.foo.variable.data, da.Array)
+                        self.assertEqual(actual.foo.variable.data.chunks,
+                                         ((5, 5),))
+                        self.assertDatasetAllClose(original, actual)
+                    with open_mfdataset([tmp1, tmp2], chunks={'x': 3}, autoclose=close) as actual:
+                        self.assertEqual(actual.foo.variable.data.chunks,
+                                         ((3, 2, 3, 2),))
 
         with self.assertRaisesRegexp(IOError, 'no files to open'):
-            open_mfdataset('foo-bar-baz-*.nc')
+            for close in [True, False]:
+                open_mfdataset('foo-bar-baz-*.nc', autoclose=close)
 
     def test_open_mfdataset_large_num_files(self, nfiles=2000):
         original = Dataset({'foo': ('x', np.random.randn(nfiles))})
@@ -1056,7 +1058,7 @@ def test_open_mfdataset_large_num_files(self, nfiles=2000):
             # split into multiple sets of temp files
             for ii in original.x.values:
                 original.isel(x=slice(ii, ii+1)).to_netcdf(tmpfiles[ii])
-            open_mfdataset(tmpfiles)
+            open_mfdataset(tmpfiles, autoclose=True)
 
     def test_preprocess_mfdataset(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1067,8 +1069,9 @@ def preprocess(ds):
                 return ds.assign_coords(z=0)
 
             expected = preprocess(original)
-            with open_mfdataset(tmp, preprocess=preprocess) as actual:
-                self.assertDatasetIdentical(expected, actual)
+            for close in [True, False]:
+                with open_mfdataset(tmp, preprocess=preprocess, autoclose=close) as actual:
+                    self.assertDatasetIdentical(expected, actual)
 
     def test_save_mfdataset_roundtrip(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1077,8 +1080,9 @@ def test_save_mfdataset_roundtrip(self):
         with create_tmp_file() as tmp1:
             with create_tmp_file() as tmp2:
                 save_mfdataset(datasets, [tmp1, tmp2])
-                with open_mfdataset([tmp1, tmp2]) as actual:
-                    self.assertDatasetIdentical(actual, original)
+                for close in [True, False]:
+                    with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
+                        self.assertDatasetIdentical(actual, original)
 
     def test_save_mfdataset_invalid(self):
         ds = Dataset()
@@ -1091,9 +1095,10 @@ def test_open_and_do_math(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
         with create_tmp_file() as tmp:
             original.to_netcdf(tmp)
-            with open_mfdataset(tmp) as ds:
-                actual = 1.0 * ds
-                self.assertDatasetAllClose(original, actual)
+            for close in [True, False]:
+                with open_mfdataset(tmp, autoclose=close) as ds:
+                    actual = 1.0 * ds
+                    self.assertDatasetAllClose(original, actual)
 
     def test_open_mfdataset_concat_dim_none(self):
         with create_tmp_file() as tmp1:
@@ -1101,8 +1106,10 @@ def test_open_mfdataset_concat_dim_none(self):
                 data = Dataset({'x': 0})
                 data.to_netcdf(tmp1)
                 Dataset({'x': np.nan}).to_netcdf(tmp2)
-                with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
-                    self.assertDatasetIdentical(data, actual)
+                for close in [True, False]:
+                    with open_mfdataset([tmp1, tmp2],
+                                        concat_dim=None, autoclose=close) as actual:
+                        self.assertDatasetIdentical(data, actual)
 
     def test_open_dataset(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1134,16 +1141,17 @@ def test_deterministic_names(self):
         with create_tmp_file() as tmp:
             data = create_test_data()
             data.to_netcdf(tmp)
-            with open_mfdataset(tmp) as ds:
-                original_names = dict((k, v.data.name)
-                                      for k, v in ds.data_vars.items())
-            with open_mfdataset(tmp) as ds:
-                repeat_names = dict((k, v.data.name)
-                                    for k, v in ds.data_vars.items())
-            for var_name, dask_name in original_names.items():
-                self.assertIn(var_name, dask_name)
-                self.assertIn(tmp, dask_name)
-            self.assertEqual(original_names, repeat_names)
+            for close in [True, False]:
+                with open_mfdataset(tmp, autoclose=close) as ds:
+                    original_names = dict((k, v.data.name)
+                                          for k, v in ds.data_vars.items())
+                with open_mfdataset(tmp, autoclose=close) as ds:
+                    repeat_names = dict((k, v.data.name)
+                                        for k, v in ds.data_vars.items())
+                for var_name, dask_name in original_names.items():
+                    self.assertIn(var_name, dask_name)
+                    self.assertIn(tmp, dask_name)
+                self.assertEqual(original_names, repeat_names)
 
     def test_dataarray_compute(self):
         # Test DataArray.compute() on dask backend.
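
For reference, a minimal usage sketch of the option documented in the
whats-new entry above; it is not part of the patch, and the file pattern
'output-*.nc' is purely illustrative:

    import xarray as xr

    # Default (autoclose=False): underlying files stay open, preserving the
    # previous xarray behavior and performance.
    ds = xr.open_mfdataset('output-*.nc')

    # Opt in with autoclose=True: files are closed when not actively being
    # read, avoiding the "too many open files" OSError for very large file
    # collections, at some cost in performance.
    ds = xr.open_mfdataset('output-*.nc', autoclose=True)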