data_vars option added to open_mfdataset #1580

Merged Oct 10, 2017 (29 commits)

Changes from 19 commits

Commits
180cf58
add data_vars option to open_mfdataset
guziy Sep 19, 2017
6195fcd
use single quotes
guziy Sep 19, 2017
956fbeb
fix the 'line too long' warning from flake8
guziy Sep 19, 2017
e721620
document the data_vars keyword for open_mfdataset
guziy Sep 19, 2017
34b1004
improve the data_vars record in whats-new
guziy Sep 19, 2017
09d25c6
update my name in wats-new.rst
guziy Sep 19, 2017
e901a37
Start writing the test for the data_vars keyword
guziy Sep 19, 2017
3141ce4
use the data_vars keyword in combine
guziy Sep 20, 2017
8319aa7
address flake8 warnings for test_backend.py
guziy Sep 20, 2017
fdc940e
ignore flake8 warnings concerning whats-new.rst
guziy Sep 20, 2017
96e842e
fix function reference in whats-new.rst
guziy Sep 20, 2017
b033bec
open_mfdataset does not accept dim keyword argument
guziy Sep 20, 2017
b854ce4
use single quotes for strings in the added tests
guziy Sep 20, 2017
787a98b
refactor data_vars related tests
guziy Sep 20, 2017
4d3c685
Use with for opening mfdataset in data_vars related tests
guziy Sep 20, 2017
1823ba3
add @requires_scipy_or_netCDF4 to the data_vars test class
guziy Sep 20, 2017
b47e665
address flake8 warnings about long lines in the data_vars related tests.
guziy Sep 20, 2017
23f0fc6
close opened datasets in case of a ValueError in open_mfdataset, seem…
guziy Sep 20, 2017
05c8391
fix line too long warnings from flake8
guziy Sep 20, 2017
1f0e763
refactor tests and open_mfdataset, to address comments
guziy Sep 21, 2017
fadda83
refactor tests for data_vars keyword in open_mfdataset
guziy Sep 21, 2017
f80fe1f
refactor to address flake8 warnings
guziy Sep 21, 2017
14dee9d
add another example of data_vars usage in open_mfdataset
guziy Sep 21, 2017
f1f9d8b
add coords keyword to open_mfdataset
guziy Sep 21, 2017
f64c9e3
add a memory and performance related observations to the whats-new an…
guziy Sep 21, 2017
633eec3
fixed a grammar mistake
guziy Sep 21, 2017
086cf25
quote variable names referenced in the text
guziy Sep 21, 2017
b0ca228
add tests for coords keyword in the open_mfdataset, along with the si…
guziy Sep 22, 2017
e463e37
split a test into 2 to simplify, introduce context manager for settin…
guziy Sep 23, 2017
13 changes: 13 additions & 0 deletions doc/whats-new.rst
@@ -38,6 +38,19 @@ Backward Incompatible Changes

Enhancements
~~~~~~~~~~~~
- Support for data_vars keyword added to
:py:func:`~xarray.open_mfdataset`
(:issue:`438`):

.. ipython::
:verbatim:
# allows opening multiple files as
ds = xarray.open_mfdataset(paths, chunks={"time": 100}, data_vars="minimal")
# instead of
ds = xarray.concat([xarray.open_dataset(p, chunks={"time": 100}) for p in paths], data_vars="minimal", dim="time")
# when the files contain common coordinate variables that should not be concatenated (e.g. lon, lat)

By `Oleksandr Huziy <https://github.com/guziy>`_.

- Support for `pathlib.Path` objects added to
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
34 changes: 27 additions & 7 deletions xarray/backends/api.py
@@ -431,7 +431,7 @@ def close(self):

def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts', preprocess=None, engine=None,
lock=None, **kwargs):
lock=None, data_vars='all', **kwargs):
Member:
For completeness, would it also make sense to pass on the coords option at this time?

Contributor Author:
Thanks, @shoyer:

I have added the coords keyword in a similar manner as data_vars.

I'll probably have to add a test for it as well.

Cheers
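
For reference, a minimal sketch of how the two keywords could be combined once coords is passed through as well. Here paths is an assumed list of netCDF files sharing lon/lat coordinates; it is not defined anywhere in this PR:

import xarray as xr

# assumption: paths is a list of netCDF files with a common 'lon'/'lat' grid
ds = xr.open_mfdataset(paths, chunks={'time': 100},
                       data_vars='minimal', coords='minimal')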

"""Open multiple files as a single dataset.

Requires dask to be installed. Attributes from the first dataset file
@@ -487,6 +487,18 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
default, a per-variable lock is used when reading data from netCDF
files with the netcdf4 and h5netcdf engines to avoid issues with
concurrent access when using dask's multithreaded backend.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
These data variables will be concatenated together:
* 'minimal': Only data variables in which the dimension already
appears are included.
* 'different': Data variables which are not equal (ignoring
attributes) across all datasets are also concatenated (as well as
all for which dimension already appears). Beware: this option may
load the data payload of data variables into memory if they are not
already loaded.
* 'all': All data variables will be concatenated.
* list of str: The listed data variables will be concatenated, in
addition to the 'minimal' data variables.
**kwargs : optional
Additional arguments passed on to :py:func:`xarray.open_dataset`.

@@ -516,12 +528,20 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
if preprocess is not None:
datasets = [preprocess(ds) for ds in datasets]

if concat_dim is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat)
else:
combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
combined._file_obj = _MultiFileCloser(file_objs)
combined.attrs = datasets[0].attrs
# close datasets in case of a ValueError
try:
if concat_dim is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat,
data_vars=data_vars)
else:
combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars)
combined._file_obj = _MultiFileCloser(file_objs)
combined.attrs = datasets[0].attrs
except ValueError as ve:
Member:

Let's only wrap the lines where this could fail -- so this should be moved up two lines, before combined._file_obj is assigned.

Member:
You can just use except ValueError: here and a plain raise below.

for ds in datasets:
ds.close()
raise ve

return combined

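A sketch of the narrower error handling the reviewers ask for above, wrapping only the auto_combine call and re-raising with a bare raise. This illustrates the suggestion and is not the code as merged:

# sketch: wrap only the call that can raise, per the review comments
try:
    if concat_dim is _CONCAT_DIM_DEFAULT:
        combined = auto_combine(datasets, compat=compat,
                                data_vars=data_vars)
    else:
        combined = auto_combine(datasets, concat_dim=concat_dim,
                                compat=compat, data_vars=data_vars)
except ValueError:
    # close any datasets already opened before propagating the error
    for ds in datasets:
        ds.close()
    raise

combined._file_obj = _MultiFileCloser(file_objs)
combined.attrs = datasets[0].attrs
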
11 changes: 7 additions & 4 deletions xarray/core/combine.py
@@ -309,7 +309,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
return arrays[0]._from_temp_dataset(ds, name)


def _auto_concat(datasets, dim=None):
def _auto_concat(datasets, dim=None, data_vars='all'):
if len(datasets) == 1:
return datasets[0]
else:
@@ -331,15 +331,15 @@ def _auto_concat(datasets, dim=None):
'supply the ``concat_dim`` argument '
'explicitly')
dim, = concat_dims
return concat(datasets, dim=dim)
return concat(datasets, dim=dim, data_vars=data_vars)


_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def auto_combine(datasets,
concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts'):
compat='no_conflicts', data_vars='all'):
"""Attempt to auto-magically combine the given datasets into one.

This method attempts to combine a list of datasets into a single entity by
@@ -380,6 +380,8 @@ def auto_combine(datasets,
- 'no_conflicts': only values which are not null in both datasets
must be equal. The returned dataset then contains the combination
of all non-null values.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
Details in the documentation of xarray.concat

Returns
-------
@@ -395,7 +397,8 @@ def auto_combine(datasets,
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
datasets).values()
concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
concatenated = [_auto_concat(ds, dim=dim, data_vars=data_vars)
for ds in grouped]
else:
concatenated = datasets
merged = merge(concatenated, compat=compat)
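For illustration, a small self-contained sketch (made-up data mirroring the tests below) of what the data_vars pass-through changes: with 'all' a variable shared identically by every dataset gains the concatenation dimension, while with 'minimal' it keeps its original shape:

import numpy as np
import xarray as xr

x = np.arange(10)
ds1 = xr.Dataset({'v1': (('t', 'x'), np.random.randn(10, 10)),
                  'lon': ('x', 2 * x)},
                 coords={'t': np.arange(10), 'x': x})
ds2 = xr.Dataset({'v1': (('t', 'x'), np.random.randn(10, 10)),
                  'lon': ('x', 2 * x)},
                 coords={'t': np.arange(10, 20), 'x': x})

# data_vars='all' (the default): 'lon' is concatenated along 't' as well
print(xr.concat([ds1, ds2], dim='t', data_vars='all')['lon'].shape)      # (20, 10)

# data_vars='minimal': 'lon' is left alone because it has no 't' dimension
print(xr.concat([ds1, ds2], dim='t', data_vars='minimal')['lon'].shape)  # (10,)
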
127 changes: 127 additions & 0 deletions xarray/tests/test_backends.py
@@ -1267,6 +1267,133 @@ def test_4_open_large_num_files_h5netcdf(self):
self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])


@requires_scipy_or_netCDF4
class OpenMFDatasetDataVarsKWTest(TestCase):
coord_name = 'lon'
var_name = 'v1'

def gen_datasets_with_common_coord_and_time(self):
# create coordinate data
nx = 10
nt = 10
x = np.arange(nx)
t1 = np.arange(nt)
t2 = np.arange(nt, 2 * nt, 1)

v1 = np.random.randn(nt, nx)
v2 = np.random.randn(nt, nx)

ds1 = Dataset(data_vars={self.var_name: (['t', 'x'], v1),
self.coord_name: ('x', 2 * x)},
coords={
't': (['t', ], t1),
'x': (['x', ], x)
})

ds2 = Dataset(data_vars={self.var_name: (['t', 'x'], v2),
self.coord_name: ('x', 2 * x)},
coords={
't': (['t', ], t2),
'x': (['x', ], x)
})

return ds1, ds2

def test_open_mfdataset_does_same_as_concat(self):
with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# save data to the temporary files
ds1.to_netcdf(tmpfile1)
ds2.to_netcdf(tmpfile2)

files = [tmpfile1, tmpfile2]
Member:
Put this shared logic in a context manager?

e.g.,

@contextlib.contextmanager
def setup_files(self):
    with create_tmp_file() as tmpfile1:
        with create_tmp_file() as tmpfile2:
            ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

            # save data to the temporary files
            ds1.to_netcdf(tmpfile1)
            ds2.to_netcdf(tmpfile2)

            yield [tmpfile1, tmpfile2]

def test_open_mfdataset_does_same_as_concat(self):
    with self.setup_files() as files:
        ...

setUp/tearDown methods would also work, with ExitStack.enter_context() and .close().

Contributor Author:
Thanks @shoyer:

I like the contextmanager trick a lot. I did feel like there should be a better way to set up tests. Actually, I have never used it before.

Cheers

for opt in ['all', 'minimal']:
with open_mfdataset(files, data_vars=opt) as ds:
kwargs = dict(data_vars=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)

data = ds[self.var_name][:]
data_expect = ds_expect[self.var_name][:]

coord = ds[self.coord_name][:]
coord_expect = ds_expect[self.coord_name][:]

self.assertArrayEqual(data, data_expect)
Member:
Can you make use of self.assertDatasetIdentical() to shorten up these tests a bit?

self.assertArrayEqual(coord, coord_expect)
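
If assertDatasetIdentical is used as suggested, the comparison block above could shrink to something like this (a sketch, assuming the combined datasets compare identical; not the merged code):

for opt in ['all', 'minimal']:
    with open_mfdataset(files, data_vars=opt) as ds:
        ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t')
        # compare whole datasets instead of individual arrays
        self.assertDatasetIdentical(ds, ds_expect)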

def test_common_coord_dims_should_change_when_datavars_all(self):
with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# save data to the temporary files
ds1.to_netcdf(tmpfile1)
ds2.to_netcdf(tmpfile2)

files = [tmpfile1, tmpfile2]
# open the files with the default data_vars='all'
with open_mfdataset(files, data_vars='all') as ds:

coord_shape = ds[self.coord_name].shape
coord_shape1 = ds1[self.coord_name].shape
coord_shape2 = ds2[self.coord_name].shape

var_shape = ds[self.var_name].shape
var_shape1 = ds1[self.var_name].shape
var_shape2 = ds2[self.var_name].shape

self.assertNotEqual(coord_shape1, coord_shape)
self.assertNotEqual(coord_shape2, coord_shape)

self.assertEqual(var_shape[0],
var_shape1[0] + var_shape2[0])

self.assertEqual(var_shape, coord_shape)

def test_common_coord_dims_should_not_change_when_datavars_minimal(self):
Member:
This looks very similar to the last test -- can you maybe consolidate it?

Or you could even potentially drop some of these tests. We have unit tests for concat and open_mfdataset already, so the main thing we need to verify is that the keyword argument gets properly passed on. We don't need to check here that every possible way to use it is handled correctly.

with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# save data to the temporary files
ds1.to_netcdf(tmpfile1)
ds2.to_netcdf(tmpfile2)

files = [tmpfile1, tmpfile2]
# open the files with data_vars='minimal'
with open_mfdataset(files, data_vars='minimal') as ds:

coord_shape = ds[self.coord_name].shape
coord_shape1 = ds1[self.coord_name].shape
coord_shape2 = ds2[self.coord_name].shape

var_shape = ds[self.var_name].shape
var_shape1 = ds1[self.var_name].shape
var_shape2 = ds2[self.var_name].shape

self.assertEqual(coord_shape1, coord_shape)

self.assertEqual(coord_shape2, coord_shape)
self.assertEqual(var_shape[0],
var_shape1[0] + var_shape2[0])
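
Following the consolidation suggestion above, the two shape tests could share one helper, roughly like this (a sketch only; setup_files is the context manager proposed earlier in the review):

def check_common_coord_shape(self, opt, coord_is_concatenated):
    # sketch: shared body for the data_vars='all' / 'minimal' shape checks
    with self.setup_files() as files:
        with open_mfdataset(files, data_vars=opt) as ds:
            coord_shape = ds[self.coord_name].shape
            var_shape = ds[self.var_name].shape
            if coord_is_concatenated:
                # 'all': the common coordinate gains the concat dimension
                self.assertEqual(var_shape, coord_shape)
            else:
                # 'minimal': the common coordinate keeps its original 1-D shape
                self.assertEqual(len(coord_shape), 1)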

def test_invalid_data_vars_value_should_fail(self):
with self.assertRaises(ValueError):
Member:
move this to only go around the line where you expect the error

with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# save data to the temporary files
ds1.to_netcdf(tmpfile1)
ds2.to_netcdf(tmpfile2)

files = [tmpfile1, tmpfile2]
with open_mfdataset(files, data_vars='minimum'):
pass
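
A sketch of the tighter scoping asked for in the review above, with assertRaises placed only around the call that is expected to fail (again assuming the setup_files helper from the earlier suggestion):

def test_invalid_data_vars_value_should_fail(self):
    with self.setup_files() as files:
        with self.assertRaises(ValueError):
            # only the open_mfdataset call is expected to raise
            with open_mfdataset(files, data_vars='minimum'):
                pass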


@requires_dask
@requires_scipy
@requires_netCDF4