From 180cf58d42a23233b69cf0a71c6c3b28c7ca0365 Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Tue, 19 Sep 2017 12:17:30 -0400 Subject: [PATCH 01/29] add data_vars option to open_mfdataset --- xarray/backends/api.py | 18 +++++++++++++++--- xarray/core/combine.py | 8 +++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e5a3136f0ca..82a26d0e4e0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -431,7 +431,7 @@ def close(self): def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, - lock=None, **kwargs): + lock=None, data_vars='all', **kwargs): """Open multiple files as a single dataset. Requires dask to be installed. Attributes from the first dataset file @@ -487,6 +487,18 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, default, a per-variable lock is used when reading data from netCDF files with the netcdf4 and h5netcdf engines to avoid issues with concurrent access when using dask's multithreaded backend. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + These data variables will be concatenated together: + * 'minimal': Only data variables in which the dimension already + appears are included. + * 'different': Data variables which are not equal (ignoring + attributes) across all datasets are also concatenated (as well as + all for which dimension already appears). Beware: this option may + load the data payload of data variables into memory if they are not + already loaded. + * 'all': All data variables will be concatenated. + * list of str: The listed data variables will be concatenated, in + addition to the 'minimal' data variables. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -517,9 +529,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, datasets = [preprocess(ds) for ds in datasets] if concat_dim is _CONCAT_DIM_DEFAULT: - combined = auto_combine(datasets, compat=compat) + combined = auto_combine(datasets, compat=compat, data_vars=data_vars) else: - combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat) + combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars) combined._file_obj = _MultiFileCloser(file_objs) combined.attrs = datasets[0].attrs diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 04b46a6624b..f9d8b0a2b61 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -309,7 +309,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat, return arrays[0]._from_temp_dataset(ds, name) -def _auto_concat(datasets, dim=None): +def _auto_concat(datasets, dim=None, data_vars="all"): if len(datasets) == 1: return datasets[0] else: @@ -331,7 +331,7 @@ def _auto_concat(datasets, dim=None): 'supply the ``concat_dim`` argument ' 'explicitly') dim, = concat_dims - return concat(datasets, dim=dim) + return concat(datasets, dim=dim, data_vars=data_vars) _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' @@ -339,7 +339,7 @@ def _auto_concat(datasets, dim=None): def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts'): + compat='no_conflicts', data_vars="all"): """Attempt to auto-magically combine the given datasets into one. 
This method attempts to combine a list of datasets into a single entity by
@@ -380,6 +380,8 @@ def auto_combine(datasets,
       - 'no_conflicts': only values which are not null in both datasets
         must be equal. The returned dataset then contains the combination
         of all non-null values.
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        Details in the documentation of xarray.concat
 
     Returns
     -------
 
From 6195fcd03f6dd594cc252108b903947a71c32c1c Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 12:20:03 -0400
Subject: [PATCH 02/29] use single quotes

---
 xarray/core/combine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index f9d8b0a2b61..110f046cb7c 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -309,7 +309,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
     return arrays[0]._from_temp_dataset(ds, name)
 
 
-def _auto_concat(datasets, dim=None, data_vars="all"):
+def _auto_concat(datasets, dim=None, data_vars='all'):
     if len(datasets) == 1:
         return datasets[0]
     else:
@@ -339,7 +339,7 @@ def _auto_concat(datasets, dim=None, data_vars="all"):
 
 def auto_combine(datasets,
                  concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts', data_vars="all"):
+                 compat='no_conflicts', data_vars='all'):
     """Attempt to auto-magically combine the given datasets into one.
 
 This method attempts to combine a list of datasets into a single entity by
 
From 956fbeb6386fa0a75713ca09afa483382fcdfc9a Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 15:49:08 -0400
Subject: [PATCH 03/29] fix the 'line too long' warning from flake8

---
 xarray/backends/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 82a26d0e4e0..89ae6591a37 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -531,7 +531,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     if concat_dim is _CONCAT_DIM_DEFAULT:
         combined = auto_combine(datasets, compat=compat, data_vars=data_vars)
     else:
-        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars)
+        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat,
+                                data_vars=data_vars)
     combined._file_obj = _MultiFileCloser(file_objs)
     combined.attrs = datasets[0].attrs
 
From e721620ed82e994b0303b0faca61068c57346f81 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 16:00:09 -0400
Subject: [PATCH 04/29] document the data_vars keyword for open_mfdataset

---
 doc/whats-new.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 99986e0beb8..086d87949d5 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -38,6 +38,17 @@ Backward Incompatible Changes
 Enhancements
 ~~~~~~~~~~~~
+- Support for data_vars keyword added to
+  py:func:`~xarray.open_mfdataset`
+  (:issue:`438`):
+  .. ipython::
+    :verbatim:
+    # allows opening multiple files as
+    In [8]: ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal", dim="time")
+    # instead of
+    In [8]: ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
+
+  By `Huziy Oleksandr `_.
- Support for `pathlib.Path` objects added to
   :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,

From 34b10044d6f2164c3358ff145e95f93e717af756 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 16:05:06 -0400
Subject: [PATCH 05/29] improve the data_vars record in whats-new

---
 doc/whats-new.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 086d87949d5..f255f238b2a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -41,12 +41,14 @@ Enhancements
 - Support for data_vars keyword added to
   py:func:`~xarray.open_mfdataset`
   (:issue:`438`):
+
   .. ipython::
     :verbatim:
    # allows opening multiple files as
    In [8]: ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal", dim="time")
    # instead of
    In [8]: ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
+   # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
   By `Oleksandr Huziy `_.

From 09d25c610595cc3389a35cb614d5bb38f0951af6 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 16:34:45 -0400
Subject: [PATCH 06/29] update my name in whats-new.rst

---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index f255f238b2a..0971bc6546e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -50,7 +50,7 @@ Enhancements
    In [8]: ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
    # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
-  By `Huziy Oleksandr `_.
+  By `Oleksandr Huziy `_.
 
 - Support for `pathlib.Path` objects added to
   :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,

From e901a3712033a392702088d5465823ec84cddd88 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Tue, 19 Sep 2017 17:20:45 -0400
Subject: [PATCH 07/29] Start writing the test for the data_vars keyword

---
 xarray/tests/test_backends.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index a977868c7e6..e00606d7336 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1267,6 +1267,19 @@ def test_4_open_large_num_files_h5netcdf(self):
         self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])
 
 
+class OpenMFDatasetDataVarsKWTest(TestCase):
+    def create_files_with_common_coordinates_and_time(self):
+        # TODO: implement
+
+        pass
+
+    def test_common_coordinate_dimensions_should_not_change_when_datavars_all(self):
+        #TODO: implement
+        assert False
+        pass
+
+
+
 @requires_dask
 @requires_scipy
 @requires_netCDF4

From 3141ce464110e64e5f9e3ac1d3f2e02b809588e0 Mon Sep 17 00:00:00 2001
From: huziy
Date: Tue, 19 Sep 2017 23:55:21 -0400
Subject: [PATCH 08/29] use the data_vars keyword in combine

---
 xarray/core/combine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 110f046cb7c..dd802b45d79 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -397,7 +397,7 @@ def auto_combine(datasets,
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
         grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
                                     datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
+        concatenated = [_auto_concat(ds, dim=dim, data_vars=data_vars) for ds in grouped]
     else:
         concatenated = datasets
     merged = merge(concatenated, compat=compat)

From 8319aa74da6e48ce4a1751a43907eef011059609 Mon Sep 17 00:00:00 2001
From: huziy
Date: Wed, 20 Sep 2017 00:20:11 -0400
Subject: [PATCH 09/29] address flake8 warnings for test_backend.py

---
 doc/whats-new.rst             |  8 +--
 xarray/core/combine.py        |  3 +-
 xarray/tests/test_backends.py | 99 +++++++++++++++++++++++++++++----
 3 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 0971bc6546e..c2a7099953b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -39,15 +39,15 @@ Backward Incompatible Changes
 Enhancements
 ~~~~~~~~~~~~
 - Support for data_vars keyword added to
-  py:func:`~xarray.open_mfdataset`
-  (:issue:`438`):
+  py: func: `~xarray.open_mfdataset`
+  (:issue: `438`):
 
 .. ipython::
    :verbatim:
    # allows opening multiple files as
-   In [8]: ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal", dim="time")
+   ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal", dim="time")
    # instead of
-   In [8]: ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
+   ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
    # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
   By `Oleksandr Huziy `_.

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index dd802b45d79..f2f2e86fe3e 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -397,7 +397,8 @@ def auto_combine(datasets,
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
         grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
                                     datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim, data_vars=data_vars) for ds in grouped]
+        concatenated = [_auto_concat(ds, dim=dim, data_vars=data_vars)
+                        for ds in grouped]
     else:
         concatenated = datasets
     merged = merge(concatenated, compat=compat)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index e00606d7336..a9458eb15db 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1268,16 +1268,95 @@ def test_4_open_large_num_files_h5netcdf(self):
 
 class OpenMFDatasetDataVarsKWTest(TestCase):
-    def create_files_with_common_coordinates_and_time(self):
-        # TODO: implement
-
-        pass
-
-    def test_common_coordinate_dimensions_should_not_change_when_datavars_all(self):
-        #TODO: implement
-        assert False
-        pass
-
+    def gen_datasets_with_common_coord_and_time(self):
+        # create coordinate data
+        nx = 10
+        nt = 10
+        x = np.arange(nx)
+        t1 = np.arange(nt)
+        t2 = np.arange(nt, 2 * nt, 1)
+
+        v1 = np.random.randn(nt, nx)
+        v2 = np.random.randn(nt, nx)
+
+        ds1 = Dataset(data_vars={'v1': (['t', 'x'], v1), 'lon': ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t1),
+                          'x': (['x', ], x)
+                      })
+
+        ds2 = Dataset(data_vars={'v1': (['t', 'x'], v2), 'lon': ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t2),
+                          'x': (['x', ], x)
+                      })
+
+        return ds1, ds2
+
+    def test_open_mfdataset_does_same_as_concat(self):
+        with create_tmp_file() as tmpfile1:
+            with create_tmp_file() as tmpfile2:
+                ds1, ds2 = self.gen_datasets_with_common_coord_and_time()
+
+                # save data to the temporary files
+                ds1.to_netcdf(tmpfile1)
+                ds2.to_netcdf(tmpfile2)
+
+                for opt in ['all', 'minimal']:
+                    ds = open_mfdataset([tmpfile1, tmpfile2], data_vars=opt)
+                    ds_expect = xr.concat([ds1, ds2], data_vars=opt,
dim="t") + + self.assertArrayEqual(ds["v1"][:], ds_expect["v1"][:]) + self.assertArrayEqual(ds["lon"][:], ds_expect["lon"][:]) + + ds.close() + + def test_common_coord_dims_should_change_when_datavars_all(self): + with create_tmp_file() as tmpfile1: + with create_tmp_file() as tmpfile2: + ds1, ds2 = self.gen_datasets_with_common_coord_and_time() + + # save data to the temporary files + ds1.to_netcdf(tmpfile1) + ds2.to_netcdf(tmpfile2) + + # open the files with the default data_vars='all' + ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='all') + + self.assertNotEqual(ds1['lon'].shape, ds['lon'].shape) + self.assertNotEqual(ds2['lon'].shape, ds['lon'].shape) + self.assertEqual(ds['v1'].shape[0], + ds1['v1'].shape[0] + ds2['v1'].shape[0]) + self.assertEqual(ds['v1'].shape, ds['lon'].shape) + + def test_common_coord_dims_should_not_change_when_datavars_minimal(self): + with create_tmp_file() as tmpfile1: + with create_tmp_file() as tmpfile2: + ds1, ds2 = self.gen_datasets_with_common_coord_and_time() + + # save data to the temporary files + ds1.to_netcdf(tmpfile1) + ds2.to_netcdf(tmpfile2) + + # open the files with the default data_vars='all' + ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='minimal') + + self.assertEqual(ds1['lon'].shape, ds['lon'].shape) + self.assertEqual(ds2['lon'].shape, ds['lon'].shape) + self.assertEqual(ds['v1'].shape[0], + ds1['v1'].shape[0] + ds2['v1'].shape[0]) + + def test_invalid_data_vars_value_should_fail(self): + with self.assertRaises(ValueError): + with create_tmp_file() as tmpfile1: + with create_tmp_file() as tmpfile2: + ds1, ds2 = self.gen_datasets_with_common_coord_and_time() + + # save data to the temporary files + ds1.to_netcdf(tmpfile1) + ds2.to_netcdf(tmpfile2) + + open_mfdataset([tmpfile1, tmpfile2], data_vars='minimum') @requires_dask From fdc940e0cce8e5db8d83e418d1a666a09ecb384c Mon Sep 17 00:00:00 2001 From: huziy Date: Wed, 20 Sep 2017 00:25:44 -0400 Subject: [PATCH 10/29] ignore flake8 warnings concerning whats-new.rst --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c2a7099953b..bdf7a97b96c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,8 +39,8 @@ Backward Incompatible Changes Enhancements ~~~~~~~~~~~~ - Support for data_vars keyword added to - py: func: `~xarray.open_mfdataset` - (:issue: `438`): + py:func:`~xarray.open_mfdataset` + (:issue:`438`): .. ipython:: :verbatim: From 96e842ec86ede7743e1e6060c22b983bd6296a54 Mon Sep 17 00:00:00 2001 From: huziy Date: Wed, 20 Sep 2017 00:31:41 -0400 Subject: [PATCH 11/29] fix function reference in whats-new.rst --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index bdf7a97b96c..fc8187a101e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,7 +39,7 @@ Backward Incompatible Changes Enhancements ~~~~~~~~~~~~ - Support for data_vars keyword added to - py:func:`~xarray.open_mfdataset` + :py:func:`~xarray.open_mfdataset` (:issue:`438`): .. ipython:: From b033bec0171e5e96c1ecc7fd24bed9712fe10dd3 Mon Sep 17 00:00:00 2001 From: huziy Date: Wed, 20 Sep 2017 00:36:07 -0400 Subject: [PATCH 12/29] open_mfdataset does not accept dim keyword argument --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index fc8187a101e..4e9633b585c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,7 +45,7 @@ Enhancements .. 
ipython::
    :verbatim:
    # allows opening multiple files as
-   ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal", dim="time")
+   ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal")
    # instead of
    ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
    # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
   By `Oleksandr Huziy `_.

From b854ce4ae4ee6ad9d8c559411c3e7d2c2666a8d6 Mon Sep 17 00:00:00 2001
From: huziy
Date: Wed, 20 Sep 2017 00:37:50 -0400
Subject: [PATCH 13/29] use single quotes for strings in the added tests

---
 xarray/tests/test_backends.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index a9458eb15db..b208a7e0746 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1304,10 +1304,10 @@ def test_open_mfdataset_does_same_as_concat(self):
 
                 for opt in ['all', 'minimal']:
                     ds = open_mfdataset([tmpfile1, tmpfile2], data_vars=opt)
-                    ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t")
+                    ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t')
 
-                    self.assertArrayEqual(ds["v1"][:], ds_expect["v1"][:])
+                    self.assertArrayEqual(ds['v1'][:], ds_expect['v1'][:])
-                    self.assertArrayEqual(ds["lon"][:], ds_expect["lon"][:])
+                    self.assertArrayEqual(ds['lon'][:], ds_expect['lon'][:])
 
                     ds.close()

From 787a98b80480e203106451200289c5232dd7ce0e Mon Sep 17 00:00:00 2001
From: huziy
Date: Wed, 20 Sep 2017 01:05:20 -0400
Subject: [PATCH 14/29] refactor data_vars related tests

---
 xarray/tests/test_backends.py | 56 +++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index b208a7e0746..3fa2077a7b8 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1268,6 +1268,9 @@ def test_4_open_large_num_files_h5netcdf(self):
 
 class OpenMFDatasetDataVarsKWTest(TestCase):
+    coord_name = 'lon'
+    var_name = 'v1'
+
     def gen_datasets_with_common_coord_and_time(self):
         # create coordinate data
         nx = 10
@@ -1279,13 +1282,15 @@ def gen_datasets_with_common_coord_and_time(self):
         v1 = np.random.randn(nt, nx)
         v2 = np.random.randn(nt, nx)
 
-        ds1 = Dataset(data_vars={'v1': (['t', 'x'], v1), 'lon': ('x', 2 * x)},
+        ds1 = Dataset(data_vars={self.var_name: (['t', 'x'], v1),
+                                 self.coord_name: ('x', 2 * x)},
                       coords={
                           't': (['t', ], t1),
                           'x': (['x', ], x)
                       })
 
-        ds2 = Dataset(data_vars={'v1': (['t', 'x'], v2), 'lon': ('x', 2 * x)},
+        ds2 = Dataset(data_vars={self.var_name: (['t', 'x'], v2),
+                                 self.coord_name: ('x', 2 * x)},
                       coords={
                           't': (['t', ], t2),
                           'x': (['x', ], x)
                       })
 
         return ds1, ds2
@@ -1306,8 +1311,14 @@ def test_open_mfdataset_does_same_as_concat(self):
                     ds = open_mfdataset([tmpfile1, tmpfile2], data_vars=opt)
                     ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t')
 
-                    self.assertArrayEqual(ds['v1'][:], ds_expect['v1'][:])
-                    self.assertArrayEqual(ds['lon'][:], ds_expect['lon'][:])
+                    data = ds[self.var_name][:]
+                    data_expect = ds_expect[self.var_name][:]
+
+                    coord = ds[self.coord_name][:]
+                    coord_expect = ds_expect[self.coord_name][:]
+
+                    self.assertArrayEqual(data, data_expect)
+                    self.assertArrayEqual(coord, coord_expect)
 
                     ds.close()
 
@@ -1323,11 +1334,21 @@ def test_common_coord_dims_should_change_when_datavars_all(self):
                 # open the files with the default data_vars='all'
                 ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='all')
 
-                self.assertNotEqual(ds1['lon'].shape,
ds['lon'].shape) - self.assertNotEqual(ds2['lon'].shape, ds['lon'].shape) - self.assertEqual(ds['v1'].shape[0], - ds1['v1'].shape[0] + ds2['v1'].shape[0]) - self.assertEqual(ds['v1'].shape, ds['lon'].shape) + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape + + var_shape = ds[self.var_name].shape + var_shape1 = ds1[self.var_name].shape + var_shape2 = ds2[self.var_name].shape + + self.assertNotEqual(coord_shape1, coord_shape) + self.assertNotEqual(coord_shape2, coord_shape) + + self.assertEqual(var_shape[0], + var_shape1[0] + var_shape2[0]) + + self.assertEqual(var_shape, coord_shape) def test_common_coord_dims_should_not_change_when_datavars_minimal(self): with create_tmp_file() as tmpfile1: @@ -1341,10 +1362,19 @@ def test_common_coord_dims_should_not_change_when_datavars_minimal(self): # open the files with the default data_vars='all' ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='minimal') - self.assertEqual(ds1['lon'].shape, ds['lon'].shape) - self.assertEqual(ds2['lon'].shape, ds['lon'].shape) - self.assertEqual(ds['v1'].shape[0], - ds1['v1'].shape[0] + ds2['v1'].shape[0]) + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape + + var_shape = ds[self.var_name].shape + var_shape1 = ds1[self.var_name].shape + var_shape2 = ds2[self.var_name].shape + + self.assertEqual(coord_shape1, coord_shape) + + self.assertEqual(coord_shape2, coord_shape) + self.assertEqual(var_shape[0], + var_shape1[0] + var_shape2[0]) def test_invalid_data_vars_value_should_fail(self): with self.assertRaises(ValueError): From 4d3c6850e22ae6b6fdbbfd9a76d9f4e16555822a Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Wed, 20 Sep 2017 09:05:03 -0400 Subject: [PATCH 15/29] Use with for opening mfdataset in data_vars related tests --- xarray/tests/test_backends.py | 67 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fa2077a7b8..ffa8a1a15cc 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1308,19 +1308,17 @@ def test_open_mfdataset_does_same_as_concat(self): ds2.to_netcdf(tmpfile2) for opt in ['all', 'minimal']: - ds = open_mfdataset([tmpfile1, tmpfile2], data_vars=opt) - ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t') + with open_mfdataset([tmpfile1, tmpfile2], data_vars=opt) as ds: + ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t') - data = ds[self.var_name][:] - data_expect = ds_expect[self.var_name][:] + data = ds[self.var_name][:] + data_expect = ds_expect[self.var_name][:] - coord = ds[self.coord_name][:] - coord_expect = ds_expect[self.coord_name][:] + coord = ds[self.coord_name][:] + coord_expect = ds_expect[self.coord_name][:] - self.assertArrayEqual(data, data_expect) - self.assertArrayEqual(coord, coord_expect) - - ds.close() + self.assertArrayEqual(data, data_expect) + self.assertArrayEqual(coord, coord_expect) def test_common_coord_dims_should_change_when_datavars_all(self): with create_tmp_file() as tmpfile1: @@ -1332,23 +1330,23 @@ def test_common_coord_dims_should_change_when_datavars_all(self): ds2.to_netcdf(tmpfile2) # open the files with the default data_vars='all' - ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='all') + with open_mfdataset([tmpfile1, tmpfile2], data_vars='all') as ds: - coord_shape = ds[self.coord_name].shape - coord_shape1 = 
ds1[self.coord_name].shape - coord_shape2 = ds2[self.coord_name].shape + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape - var_shape = ds[self.var_name].shape - var_shape1 = ds1[self.var_name].shape - var_shape2 = ds2[self.var_name].shape + var_shape = ds[self.var_name].shape + var_shape1 = ds1[self.var_name].shape + var_shape2 = ds2[self.var_name].shape - self.assertNotEqual(coord_shape1, coord_shape) - self.assertNotEqual(coord_shape2, coord_shape) + self.assertNotEqual(coord_shape1, coord_shape) + self.assertNotEqual(coord_shape2, coord_shape) - self.assertEqual(var_shape[0], - var_shape1[0] + var_shape2[0]) + self.assertEqual(var_shape[0], + var_shape1[0] + var_shape2[0]) - self.assertEqual(var_shape, coord_shape) + self.assertEqual(var_shape, coord_shape) def test_common_coord_dims_should_not_change_when_datavars_minimal(self): with create_tmp_file() as tmpfile1: @@ -1360,21 +1358,21 @@ def test_common_coord_dims_should_not_change_when_datavars_minimal(self): ds2.to_netcdf(tmpfile2) # open the files with the default data_vars='all' - ds = open_mfdataset([tmpfile1, tmpfile2], data_vars='minimal') + with open_mfdataset([tmpfile1, tmpfile2], data_vars='minimal') as ds: - coord_shape = ds[self.coord_name].shape - coord_shape1 = ds1[self.coord_name].shape - coord_shape2 = ds2[self.coord_name].shape + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape - var_shape = ds[self.var_name].shape - var_shape1 = ds1[self.var_name].shape - var_shape2 = ds2[self.var_name].shape + var_shape = ds[self.var_name].shape + var_shape1 = ds1[self.var_name].shape + var_shape2 = ds2[self.var_name].shape - self.assertEqual(coord_shape1, coord_shape) + self.assertEqual(coord_shape1, coord_shape) - self.assertEqual(coord_shape2, coord_shape) - self.assertEqual(var_shape[0], - var_shape1[0] + var_shape2[0]) + self.assertEqual(coord_shape2, coord_shape) + self.assertEqual(var_shape[0], + var_shape1[0] + var_shape2[0]) def test_invalid_data_vars_value_should_fail(self): with self.assertRaises(ValueError): @@ -1386,7 +1384,8 @@ def test_invalid_data_vars_value_should_fail(self): ds1.to_netcdf(tmpfile1) ds2.to_netcdf(tmpfile2) - open_mfdataset([tmpfile1, tmpfile2], data_vars='minimum') + with open_mfdataset([tmpfile1, tmpfile2], data_vars='minimum'): + pass @requires_dask From 1823ba330a2aa41b487532cef0d1b641e409128f Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Wed, 20 Sep 2017 09:12:47 -0400 Subject: [PATCH 16/29] add @requires_scipy_or_netCDF4 to the data_vars test class --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ffa8a1a15cc..3d96a416ffc 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1266,7 +1266,7 @@ def test_3_open_large_num_files_pynio(self): def test_4_open_large_num_files_h5netcdf(self): self.validate_open_mfdataset_large_num_files(engine=['h5netcdf']) - +@requires_scipy_or_netCDF4 class OpenMFDatasetDataVarsKWTest(TestCase): coord_name = 'lon' var_name = 'v1' From b47e665867a831c27ae0eaf9adb76c1f74b8e3ce Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Wed, 20 Sep 2017 09:25:07 -0400 Subject: [PATCH 17/29] address flake8 warnings about long lines in the data_vars related tests. 
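The long ``open_mfdataset(...)`` calls are shortened by first binding the
file list to a local name, following the pattern from the diff below:

    files = [tmpfile1, tmpfile2]
    with open_mfdataset(files, data_vars=opt) as ds:
        ...

so each call now fits within flake8's default 79-character line limit.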
--- xarray/tests/test_backends.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3d96a416ffc..ad3e669504b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1266,6 +1266,7 @@ def test_3_open_large_num_files_pynio(self): def test_4_open_large_num_files_h5netcdf(self): self.validate_open_mfdataset_large_num_files(engine=['h5netcdf']) + @requires_scipy_or_netCDF4 class OpenMFDatasetDataVarsKWTest(TestCase): coord_name = 'lon' @@ -1307,9 +1308,11 @@ def test_open_mfdataset_does_same_as_concat(self): ds1.to_netcdf(tmpfile1) ds2.to_netcdf(tmpfile2) + files = [tmpfile1, tmpfile2] for opt in ['all', 'minimal']: - with open_mfdataset([tmpfile1, tmpfile2], data_vars=opt) as ds: - ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t') + with open_mfdataset(files, data_vars=opt) as ds: + kwargs = dict(data_vars=opt, dim='t') + ds_expect = xr.concat([ds1, ds2], **kwargs) data = ds[self.var_name][:] data_expect = ds_expect[self.var_name][:] @@ -1329,8 +1332,9 @@ def test_common_coord_dims_should_change_when_datavars_all(self): ds1.to_netcdf(tmpfile1) ds2.to_netcdf(tmpfile2) + files = [tmpfile1, tmpfile2] # open the files with the default data_vars='all' - with open_mfdataset([tmpfile1, tmpfile2], data_vars='all') as ds: + with open_mfdataset(files, data_vars='all') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -1357,8 +1361,9 @@ def test_common_coord_dims_should_not_change_when_datavars_minimal(self): ds1.to_netcdf(tmpfile1) ds2.to_netcdf(tmpfile2) + files = [tmpfile1, tmpfile2] # open the files with the default data_vars='all' - with open_mfdataset([tmpfile1, tmpfile2], data_vars='minimal') as ds: + with open_mfdataset(files, data_vars='minimal') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -1384,7 +1389,8 @@ def test_invalid_data_vars_value_should_fail(self): ds1.to_netcdf(tmpfile1) ds2.to_netcdf(tmpfile2) - with open_mfdataset([tmpfile1, tmpfile2], data_vars='minimum'): + files = [tmpfile1, tmpfile2] + with open_mfdataset(files, data_vars='minimum'): pass From 23f0fc692177ee8e473c9110b768502721b73686 Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Wed, 20 Sep 2017 11:20:08 -0400 Subject: [PATCH 18/29] close opened datasets in case of a ValueError in open_mfdataset, seems important for Windows --- xarray/backends/api.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 89ae6591a37..15df23a95cd 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -528,13 +528,19 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, if preprocess is not None: datasets = [preprocess(ds) for ds in datasets] - if concat_dim is _CONCAT_DIM_DEFAULT: - combined = auto_combine(datasets, compat=compat, data_vars=data_vars) - else: - combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, - data_vars=data_vars) - combined._file_obj = _MultiFileCloser(file_objs) - combined.attrs = datasets[0].attrs + # close datasets in case of a ValueError + try: + if concat_dim is _CONCAT_DIM_DEFAULT: + combined = auto_combine(datasets, compat=compat, data_vars=data_vars) + else: + combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, + data_vars=data_vars) + combined._file_obj = _MultiFileCloser(file_objs) + combined.attrs = datasets[0].attrs + except 
ValueError as ve: + for ds in datasets: + ds.close() + raise ve return combined From 05c8391afd7e4aec28b74908c89ac3736df4c1ac Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Wed, 20 Sep 2017 15:53:38 -0400 Subject: [PATCH 19/29] fix line too long warnings from flake8 --- xarray/backends/api.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 15df23a95cd..6346efcefd5 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -531,10 +531,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # close datasets in case of a ValueError try: if concat_dim is _CONCAT_DIM_DEFAULT: - combined = auto_combine(datasets, compat=compat, data_vars=data_vars) - else: - combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, + combined = auto_combine(datasets, compat=compat, data_vars=data_vars) + else: + combined = auto_combine(datasets, concat_dim=concat_dim, + compat=compat, data_vars=data_vars) combined._file_obj = _MultiFileCloser(file_objs) combined.attrs = datasets[0].attrs except ValueError as ve: From 1f0e7631b6250b55ba7aadf699919b3666097dcf Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Thu, 21 Sep 2017 12:57:13 -0400 Subject: [PATCH 20/29] refactor tests and open_mfdataset, to address comments --- xarray/backends/api.py | 11 ++++-- xarray/tests/test_backends.py | 71 ++++++++++------------------------- 2 files changed, 26 insertions(+), 56 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 6346efcefd5..7fbc073400e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -536,12 +536,15 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, else: combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars) - combined._file_obj = _MultiFileCloser(file_objs) - combined.attrs = datasets[0].attrs - except ValueError as ve: + except ValueError: for ds in datasets: ds.close() - raise ve + raise + + combined._file_obj = _MultiFileCloser(file_objs) + combined.attrs = datasets[0].attrs + + return combined diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ad3e669504b..527bc0cabd1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1313,17 +1313,9 @@ def test_open_mfdataset_does_same_as_concat(self): with open_mfdataset(files, data_vars=opt) as ds: kwargs = dict(data_vars=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) + self.assertDatasetIdentical(ds, ds_expect) - data = ds[self.var_name][:] - data_expect = ds_expect[self.var_name][:] - - coord = ds[self.coord_name][:] - coord_expect = ds_expect[self.coord_name][:] - - self.assertArrayEqual(data, data_expect) - self.assertArrayEqual(coord, coord_expect) - - def test_common_coord_dims_should_change_when_datavars_all(self): + def test_common_coord_when_datavars_passed(self): with create_tmp_file() as tmpfile1: with create_tmp_file() as tmpfile2: ds1, ds2 = self.gen_datasets_with_common_coord_and_time() @@ -1333,26 +1325,28 @@ def test_common_coord_dims_should_change_when_datavars_all(self): ds2.to_netcdf(tmpfile2) files = [tmpfile1, tmpfile2] - # open the files with the default data_vars='all' - with open_mfdataset(files, data_vars='all') as ds: - coord_shape = ds[self.coord_name].shape - coord_shape1 = ds1[self.coord_name].shape - coord_shape2 = ds2[self.coord_name].shape + for opt in ['all', 'minimal']: + # open the files with the default data_vars='all' + with 
open_mfdataset(files, data_vars=opt) as ds: + + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape - var_shape = ds[self.var_name].shape - var_shape1 = ds1[self.var_name].shape - var_shape2 = ds2[self.var_name].shape + var_shape = ds[self.var_name].shape - self.assertNotEqual(coord_shape1, coord_shape) - self.assertNotEqual(coord_shape2, coord_shape) + if opt == 'all': + self.assertEqual(var_shape, coord_shape) + self.assertNotEqual(coord_shape1, coord_shape) + self.assertNotEqual(coord_shape2, coord_shape) - self.assertEqual(var_shape[0], - var_shape1[0] + var_shape2[0]) + if opt == 'minimal': + self.assertEqual(coord_shape1, coord_shape) + self.assertEqual(coord_shape2, coord_shape) - self.assertEqual(var_shape, coord_shape) - def test_common_coord_dims_should_not_change_when_datavars_minimal(self): + def test_invalid_data_vars_value_should_fail(self): with create_tmp_file() as tmpfile1: with create_tmp_file() as tmpfile2: ds1, ds2 = self.gen_datasets_with_common_coord_and_time() @@ -1362,34 +1356,7 @@ def test_common_coord_dims_should_not_change_when_datavars_minimal(self): ds2.to_netcdf(tmpfile2) files = [tmpfile1, tmpfile2] - # open the files with the default data_vars='all' - with open_mfdataset(files, data_vars='minimal') as ds: - - coord_shape = ds[self.coord_name].shape - coord_shape1 = ds1[self.coord_name].shape - coord_shape2 = ds2[self.coord_name].shape - - var_shape = ds[self.var_name].shape - var_shape1 = ds1[self.var_name].shape - var_shape2 = ds2[self.var_name].shape - - self.assertEqual(coord_shape1, coord_shape) - - self.assertEqual(coord_shape2, coord_shape) - self.assertEqual(var_shape[0], - var_shape1[0] + var_shape2[0]) - - def test_invalid_data_vars_value_should_fail(self): - with self.assertRaises(ValueError): - with create_tmp_file() as tmpfile1: - with create_tmp_file() as tmpfile2: - ds1, ds2 = self.gen_datasets_with_common_coord_and_time() - - # save data to the temporary files - ds1.to_netcdf(tmpfile1) - ds2.to_netcdf(tmpfile2) - - files = [tmpfile1, tmpfile2] + with self.assertRaises(ValueError): with open_mfdataset(files, data_vars='minimum'): pass From fadda83469e6159cfc4cfec42fc4627ef02ced94 Mon Sep 17 00:00:00 2001 From: Oleksandr Huziy Date: Thu, 21 Sep 2017 14:31:32 -0400 Subject: [PATCH 21/29] refactor tests for data_vars keyword in open_mfdataset --- xarray/tests/test_backends.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 527bc0cabd1..73cb8f5ea44 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1336,15 +1336,24 @@ def test_common_coord_when_datavars_passed(self): var_shape = ds[self.var_name].shape + tests = [] + # shape pairs to be compared + shape_pairs = [ + (var_shape, coord_shape), + (coord_shape1, coord_shape), + (coord_shape2, coord_shape) + ] + # tests to be applied to respective pairs if opt == 'all': - self.assertEqual(var_shape, coord_shape) - self.assertNotEqual(coord_shape1, coord_shape) - self.assertNotEqual(coord_shape2, coord_shape) + tests = [self.assertEqual, + self.assertNotEqual, self.assertNotEqual] if opt == 'minimal': - self.assertEqual(coord_shape1, coord_shape) - self.assertEqual(coord_shape2, coord_shape) + tests = [self.assertNotEqual, + self.assertEqual, self.assertEqual] + for a_test, a_shape_pair in zip(tests, shape_pairs): + a_test(*a_shape_pair) def test_invalid_data_vars_value_should_fail(self): with 
create_tmp_file() as tmpfile1:

From f80fe1f35c3ad99fa140a0984c2d8eee13d19317 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 14:32:07 -0400
Subject: [PATCH 22/29] refactor to address flake8 warnings

---
 xarray/backends/api.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 7fbc073400e..137e8b1897e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -543,9 +543,6 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
 
     combined._file_obj = _MultiFileCloser(file_objs)
     combined.attrs = datasets[0].attrs
-
-
-
     return combined

From 14dee9d912f5d4fc5298174f5b22e5d3321c4880 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 15:01:11 -0400
Subject: [PATCH 23/29] add another example of data_vars usage in open_mfdataset

---
 doc/whats-new.rst | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 4e9633b585c..b5e2f35ae3d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -38,7 +38,7 @@ Backward Incompatible Changes
 Enhancements
 ~~~~~~~~~~~~
-- Support for data_vars keyword added to
+- Support for data_vars and coords keywords added to
   :py:func:`~xarray.open_mfdataset`
   (:issue:`438`):
 
@@ -50,6 +50,19 @@ Enhancements
    ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
    # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
+   # in the case of 'minimal', the time dimension is not added to spatial coordinates
+   In [1]: ds = xarray.open_mfdataset("daymet_v3_tmin_*", data_vars="all")
+
+   In [2]: ds["lon"].shape
+
+   Out[2]: (13505, 808, 782)
+
+   In [3]: ds = xarray.open_mfdataset("daymet_v3_tmin_*", data_vars="minimal")
+
+   In [4]: ds["lon"].shape
+
+   Out[4]: (808, 782)
+
   By `Oleksandr Huziy `_.
 
 - Support for `pathlib.Path` objects added to

From f1f9d8baa9be7cec055168e76103b610bf5ba736 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 15:01:42 -0400
Subject: [PATCH 24/29] add coords keyword to open_mfdataset

---
 xarray/backends/api.py | 21 ++++++++++++++++++---
 xarray/core/combine.py | 16 ++++++++++------
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 137e8b1897e..6bf0ed03af2 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -431,7 +431,7 @@ def close(self):
 
 def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    compat='no_conflicts', preprocess=None, engine=None,
-                   lock=None, data_vars='all', **kwargs):
+                   lock=None, data_vars='all', coords='different', **kwargs):
     """Open multiple files as a single dataset.
 
     Requires dask to be installed. Attributes from the first dataset file
@@ -499,6 +499,20 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
       * 'all': All data variables will be concatenated.
       * list of str: The listed data variables will be concatenated, in
         addition to the 'minimal' data variables.
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        These coordinate variables will be concatenated together:
+        * 'minimal': Only coordinates in which the dimension already appears
+          are included.
+        * 'different': Coordinates which are not equal (ignoring attributes)
+          across all datasets are also concatenated (as well as all for which
+          dimension already appears). Beware: this option may load the data
+          payload of coordinate variables into memory if they are not already
+          loaded.
+        * 'all': All coordinate variables will be concatenated, except
+          those corresponding to other dimensions.
+        * list of str: The listed coordinate variables will be concatenated,
+          in addition to the 'minimal' coordinates.
+
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
 
@@ -532,10 +546,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     try:
         if concat_dim is _CONCAT_DIM_DEFAULT:
             combined = auto_combine(datasets, compat=compat,
-                                    data_vars=data_vars)
+                                    data_vars=data_vars, coords=coords)
         else:
             combined = auto_combine(datasets, concat_dim=concat_dim,
-                                    compat=compat, data_vars=data_vars)
+                                    compat=compat,
+                                    data_vars=data_vars, coords=coords)
     except ValueError:
         for ds in datasets:
             ds.close()
         raise
 
     combined._file_obj = _MultiFileCloser(file_objs)
     combined.attrs = datasets[0].attrs
     return combined

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index f2f2e86fe3e..ff320336244 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -309,7 +309,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
     return arrays[0]._from_temp_dataset(ds, name)
 
 
-def _auto_concat(datasets, dim=None, data_vars='all'):
+def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
     if len(datasets) == 1:
         return datasets[0]
     else:
@@ -331,7 +331,7 @@ def _auto_concat(datasets, dim=None, data_vars='all'):
                              'supply the ``concat_dim`` argument '
                              'explicitly')
         dim, = concat_dims
-        return concat(datasets, dim=dim, data_vars=data_vars)
+        return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
 
 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
 
 def auto_combine(datasets,
                  concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts', data_vars='all'):
+                 compat='no_conflicts',
+                 data_vars='all', coords='different'):
     """Attempt to auto-magically combine the given datasets into one.
 
 This method attempts to combine a list of datasets into a single entity by
@@ -380,8 +381,10 @@ def auto_combine(datasets,
       - 'no_conflicts': only values which are not null in both datasets
         must be equal. The returned dataset then contains the combination
        of all non-null values.
-    data_vars : {'minimal', 'different', 'all' or list of str}, optional
-        Details in the documentation of xarray.concat
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
 
     Returns
     -------

From f64c9e377dcd8f147e8cbf538ca49db3f19cc7ee Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 16:51:12 -0400
Subject: [PATCH 25/29] add memory and performance related observations to the whats-new and modify code snippets to use single quotes for consistency.

---
 doc/whats-new.rst | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b5e2f35ae3d..5653e9d3928 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -45,24 +45,26 @@ Enhancements
 
 .. ipython::
    :verbatim:
    # allows opening multiple files as
-   ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal")
+   ds = xarray.open_mfdataset(paths, chunks={'time': 100}, data_vars='minimal')
    # instead of
-   ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
+   ds = xarray.concat([xarray.open_dataset(p, chunks={'time': 100}) for p in paths], data_vars='minimal', dim='time')
    # in the cases when they contain the same coordinate variables that should not be concatenated (i.e. lon, lat)
 
    # in the case of 'minimal', the time dimension is not added to spatial coordinates
-   In [1]: ds = xarray.open_mfdataset("daymet_v3_tmin_*", data_vars="all")
+   In [1]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='all')
 
    In [2]: ds['lon'].shape
 
    Out[2]: (13505, 808, 782)
 
-   In [3]: ds = xarray.open_mfdataset("daymet_v3_tmin_*", data_vars="minimal")
+   In [3]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='minimal')
 
    In [4]: ds['lon'].shape
 
    Out[4]: (808, 782)
 
+   # I also noticed that my memory-intensive applications use much less memory and faster, when ``data_vars='minimal'`` is used.
 
   By `Oleksandr Huziy `_.

From 633eec37a987b6a485ada0d1025434a91bab4322 Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 17:34:37 -0400
Subject: [PATCH 26/29] fixed a grammar mistake

---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 5653e9d3928..71edafa12ba 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -63,7 +63,7 @@ Enhancements
 
    Out[4]: (808, 782)
 
-   # I also noticed that my memory-intensive applications use much less memory and faster, when ``data_vars='minimal'`` is used.
+   # I also noticed that my memory-intensive applications use much less memory and run faster, when ``data_vars='minimal'`` is used.
 
   By `Oleksandr Huziy `_.

From 086cf250f7e5d07f5de0811628e17100a505d41b Mon Sep 17 00:00:00 2001
From: Oleksandr Huziy
Date: Thu, 21 Sep 2017 17:37:07 -0400
Subject: [PATCH 27/29] quote variable names referenced in the text

---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 71edafa12ba..63c0c6a6502 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -38,7 +38,7 @@ Backward Incompatible Changes
 Enhancements
 ~~~~~~~~~~~~
-- Support for data_vars and coords keywords added to
+- Support for ``data_vars`` and ``coords`` keywords added to
   :py:func:`~xarray.open_mfdataset`
   (:issue:`438`):
 
From b0ca22803c33cea345c729f62c882f605a4160ae Mon Sep 17 00:00:00 2001
From: huziy
Date: Thu, 21 Sep 2017 21:39:02 -0400
Subject: [PATCH 28/29] add tests for the coords keyword in open_mfdataset, along with similar tests for the data_vars keyword.
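The coords keyword is exercised through the same path as data_vars: for each
option, the result of open_mfdataset is compared against an explicit concat,
as in the diff below:

    with open_mfdataset(files, coords=opt) as ds:
        kwargs = dict(coords=opt, dim='t')
        ds_expect = xr.concat([ds1, ds2], **kwargs)
        self.assertDatasetIdentical(ds, ds_expect)

and an invalid value such as coords='minimum' is expected to raise ValueError.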
--- xarray/tests/test_backends.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 73cb8f5ea44..11e695c296f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1268,7 +1268,7 @@ def test_4_open_large_num_files_h5netcdf(self): @requires_scipy_or_netCDF4 -class OpenMFDatasetDataVarsKWTest(TestCase): +class OpenMFDatasetWithDataVarsAndCoordsKwTest(TestCase): coord_name = 'lon' var_name = 'v1' @@ -1309,12 +1309,19 @@ def test_open_mfdataset_does_same_as_concat(self): ds2.to_netcdf(tmpfile2) files = [tmpfile1, tmpfile2] - for opt in ['all', 'minimal']: + + options = ['all', 'minimal', 'different', ] + for opt in options: with open_mfdataset(files, data_vars=opt) as ds: kwargs = dict(data_vars=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) self.assertDatasetIdentical(ds, ds_expect) + with open_mfdataset(files, coords=opt) as ds: + kwargs = dict(coords=opt, dim='t') + ds_expect = xr.concat([ds1, ds2], **kwargs) + self.assertDatasetIdentical(ds, ds_expect) + def test_common_coord_when_datavars_passed(self): with create_tmp_file() as tmpfile1: with create_tmp_file() as tmpfile2: @@ -1369,6 +1376,11 @@ def test_invalid_data_vars_value_should_fail(self): with open_mfdataset(files, data_vars='minimum'): pass + # test invalid coord parameter + with self.assertRaises(ValueError): + with open_mfdataset(files, coords='minimum'): + pass + @requires_dask @requires_scipy From e463e371d695a0de31bcbe02b8b38fa751ce427b Mon Sep 17 00:00:00 2001 From: huziy Date: Fri, 22 Sep 2017 23:21:30 -0400 Subject: [PATCH 29/29] split a test into 2 to simplify, introduce context manager for setting up test inputs in OpenMFDatasetWithDataVarsAndCoordsKwTest --- xarray/tests/test_backends.py | 161 ++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 76 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 11e695c296f..cf08ff67671 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1272,6 +1272,18 @@ class OpenMFDatasetWithDataVarsAndCoordsKwTest(TestCase): coord_name = 'lon' var_name = 'v1' + @contextlib.contextmanager + def setup_files_and_datasets(self): + ds1, ds2 = self.gen_datasets_with_common_coord_and_time() + with create_tmp_file() as tmpfile1: + with create_tmp_file() as tmpfile2: + + # save data to the temporary files + ds1.to_netcdf(tmpfile1) + ds2.to_netcdf(tmpfile2) + + yield [tmpfile1, tmpfile2], [ds1, ds2] + def gen_datasets_with_common_coord_and_time(self): # create coordinate data nx = 10 @@ -1300,86 +1312,83 @@ def gen_datasets_with_common_coord_and_time(self): return ds1, ds2 def test_open_mfdataset_does_same_as_concat(self): - with create_tmp_file() as tmpfile1: - with create_tmp_file() as tmpfile2: - ds1, ds2 = self.gen_datasets_with_common_coord_and_time() - - # save data to the temporary files - ds1.to_netcdf(tmpfile1) - ds2.to_netcdf(tmpfile2) - - files = [tmpfile1, tmpfile2] - - options = ['all', 'minimal', 'different', ] - for opt in options: - with open_mfdataset(files, data_vars=opt) as ds: - kwargs = dict(data_vars=opt, dim='t') - ds_expect = xr.concat([ds1, ds2], **kwargs) - self.assertDatasetIdentical(ds, ds_expect) - - with open_mfdataset(files, coords=opt) as ds: - kwargs = dict(coords=opt, dim='t') - ds_expect = xr.concat([ds1, ds2], **kwargs) - self.assertDatasetIdentical(ds, ds_expect) - - def test_common_coord_when_datavars_passed(self): - with create_tmp_file() 
as tmpfile1: - with create_tmp_file() as tmpfile2: - ds1, ds2 = self.gen_datasets_with_common_coord_and_time() - - # save data to the temporary files - ds1.to_netcdf(tmpfile1) - ds2.to_netcdf(tmpfile2) - - files = [tmpfile1, tmpfile2] - - for opt in ['all', 'minimal']: - # open the files with the default data_vars='all' - with open_mfdataset(files, data_vars=opt) as ds: - - coord_shape = ds[self.coord_name].shape - coord_shape1 = ds1[self.coord_name].shape - coord_shape2 = ds2[self.coord_name].shape - - var_shape = ds[self.var_name].shape - - tests = [] - # shape pairs to be compared - shape_pairs = [ - (var_shape, coord_shape), - (coord_shape1, coord_shape), - (coord_shape2, coord_shape) - ] - # tests to be applied to respective pairs - if opt == 'all': - tests = [self.assertEqual, - self.assertNotEqual, self.assertNotEqual] - - if opt == 'minimal': - tests = [self.assertNotEqual, - self.assertEqual, self.assertEqual] - - for a_test, a_shape_pair in zip(tests, shape_pairs): - a_test(*a_shape_pair) + options = ['all', 'minimal', 'different', ] + + with self.setup_files_and_datasets() as (files, [ds1, ds2]): + for opt in options: + with open_mfdataset(files, data_vars=opt) as ds: + kwargs = dict(data_vars=opt, dim='t') + ds_expect = xr.concat([ds1, ds2], **kwargs) + self.assertDatasetIdentical(ds, ds_expect) + + with open_mfdataset(files, coords=opt) as ds: + kwargs = dict(coords=opt, dim='t') + ds_expect = xr.concat([ds1, ds2], **kwargs) + self.assertDatasetIdentical(ds, ds_expect) + + def test_common_coord_when_datavars_all(self): + opt = 'all' + + with self.setup_files_and_datasets() as (files, [ds1, ds2]): + # open the files with the data_var option + with open_mfdataset(files, data_vars=opt) as ds: + + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape + + var_shape = ds[self.var_name].shape + + # shape pairs to be compared + shape_pairs = [ + (var_shape, coord_shape), + (coord_shape1, coord_shape), + (coord_shape2, coord_shape) + ] + # tests to be applied to respective pairs + tests = [self.assertEqual, + self.assertNotEqual, self.assertNotEqual] + + for a_test, a_shape_pair in zip(tests, shape_pairs): + a_test(*a_shape_pair) + + def test_common_coord_when_datavars_minimal(self): + opt = 'minimal' + + with self.setup_files_and_datasets() as (files, [ds1, ds2]): + # open the files using data_vars option + with open_mfdataset(files, data_vars=opt) as ds: + + coord_shape = ds[self.coord_name].shape + coord_shape1 = ds1[self.coord_name].shape + coord_shape2 = ds2[self.coord_name].shape + + var_shape = ds[self.var_name].shape + + # shape pairs to be compared + shape_pairs = [ + (var_shape, coord_shape), + (coord_shape1, coord_shape), + (coord_shape2, coord_shape) + ] + # tests to be applied to respective pairs + tests = [self.assertNotEqual, + self.assertEqual, self.assertEqual] + + for a_test, a_shape_pair in zip(tests, shape_pairs): + a_test(*a_shape_pair) def test_invalid_data_vars_value_should_fail(self): - with create_tmp_file() as tmpfile1: - with create_tmp_file() as tmpfile2: - ds1, ds2 = self.gen_datasets_with_common_coord_and_time() - - # save data to the temporary files - ds1.to_netcdf(tmpfile1) - ds2.to_netcdf(tmpfile2) - files = [tmpfile1, tmpfile2] - with self.assertRaises(ValueError): - with open_mfdataset(files, data_vars='minimum'): - pass + with self.setup_files_and_datasets() as (files, _): + with self.assertRaises(ValueError): + with open_mfdataset(files, data_vars='minimum'): + pass - # test 
invalid coord parameter - with self.assertRaises(ValueError): - with open_mfdataset(files, coords='minimum'): - pass + # test invalid coord parameter + with self.assertRaises(ValueError): + with open_mfdataset(files, coords='minimum'): + pass @requires_dask
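
Taken together, the series makes open_mfdataset forward data_vars and coords
to concat. A minimal usage sketch of the resulting behaviour (the file names
below are hypothetical, not part of the patches):

    import xarray as xr

    # Two files holding consecutive chunks of a time series plus the same
    # static coordinate variables (e.g. lon/lat defined on the x dimension).
    paths = ['part_2015.nc', 'part_2016.nc']

    # With data_vars='minimal', only variables that already contain the
    # concatenation dimension are concatenated, so static fields such as
    # lon keep their original shape instead of gaining a time dimension.
    ds = xr.open_mfdataset(paths, data_vars='minimal')

    # coords='minimal' applies the same rule to coordinate variables; an
    # invalid value raises ValueError and the opened files are closed again.
    ds = xr.open_mfdataset(paths, data_vars='minimal', coords='minimal')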