Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

open_mfdataset: Raise if combine='by_coords' and concat_dim=None #5231

Merged
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ New Features
expand, ``False`` to always collapse, or ``default`` to expand unless over a
pre-defined limit (:pull:`5126`).
By `Tom White <https://github.com/tomwhite>`_.
- Prevent passing ``concat_dim`` to :py:func:`xarray.open_mfdataset` when
``combine='by_coords'`` is specified, which should never have been possible (as
:py:func:`xarray.combine_by_coords` has no ``concat_dim`` argument to pass to).
Also removes unneeded internal reordering of datasets in
:py:func:`xarray.open_mfdataset` when ``combine='by_coords'`` is specified.
Fixes :issue:`5230`.
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
22 changes: 16 additions & 6 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,7 @@ def open_mfdataset(
see the full documentation for more details [2]_.
concat_dim : str, or list of str, DataArray, Index or None, optional
Dimensions to concatenate files along. You only need to provide this argument
if ``combine='by_coords'``, and if any of the dimensions along which you want to
if ``combine='nested'``, and if any of the dimensions along which you want to
concatenate is not a dimension in the original datasets, e.g., if you want to
stack a collection of 2D arrays along a third dimension. Set
``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a
Expand Down Expand Up @@ -877,14 +877,24 @@ def open_mfdataset(
if not paths:
raise OSError("no files to open")

# If combine='by_coords' then this is unnecessary, but quick.
# If combine='nested' then this creates a flat list which is easier to
# iterate over, while saving the originally-supplied structure as "ids"
if combine == "nested":
if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
concat_dim = [concat_dim]
combined_ids_paths = _infer_concat_order_from_positions(paths)
ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values()))

# This creates a flat list which is easier to iterate over, whilst
# encoding the originally-supplied structure as "ids".
# The "ids" are not used at all if combine='by_coords`.
combined_ids_paths = _infer_concat_order_from_positions(paths)
ids, paths = (
list(combined_ids_paths.keys()),
list(combined_ids_paths.values()),
)

elif combine == "by_coords" and concat_dim is not None:
raise ValueError(
"`concat_dim` can only be used with combine='nested',"
" not with combine='by_coords'"
)

open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs)

Expand Down
8 changes: 4 additions & 4 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def combine_nested(

To concatenate along multiple dimensions the datasets must be passed as a
nested list-of-lists, with a depth equal to the length of ``concat_dims``.
``manual_combine`` will concatenate along the top-level list first.
``combine_nested`` will concatenate along the top-level list first.

Useful for combining datasets from a set of nested directories, or for
collecting the output of a simulation parallelized along multiple
Expand Down Expand Up @@ -496,7 +496,7 @@ def combine_nested(
temperature (x, y) float64 1.764 0.4002 -0.1032 ... 0.04576 -0.1872
precipitation (x, y) float64 1.868 -0.9773 0.761 ... -0.7422 0.1549 0.3782

``manual_combine`` can also be used to explicitly merge datasets with
``combine_nested`` can also be used to explicitly merge datasets with
different variables. For example if we have 4 datasets, which are divided
along two times, and contain two different variables, we can pass ``None``
to ``concat_dim`` to specify the dimension of the nested list over which
Expand Down Expand Up @@ -540,7 +540,7 @@ def combine_nested(
if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
concat_dim = [concat_dim]

# The IDs argument tells _manual_combine that datasets aren't yet sorted
# The IDs argument tells _nested_combine that datasets aren't yet sorted
return _nested_combine(
datasets,
concat_dims=concat_dim,
Expand Down Expand Up @@ -583,7 +583,7 @@ def combine_by_coords(

Aligns coordinates, but different variables on datasets can cause it
to fail under some scenarios. In complex cases, you may need to clean up
your data and use concat/merge explicitly (also see `manual_combine`).
your data and use concat/merge explicitly (also see `combine_nested`).

Works well if, for example, you have N years of data and M data variables,
and each combination of a distinct time period and set of data variables is
Expand Down
32 changes: 18 additions & 14 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -3012,15 +3012,12 @@ def gen_datasets_with_common_coord_and_time(self):

return ds1, ds2

@pytest.mark.parametrize("combine", ["nested", "by_coords"])
@pytest.mark.parametrize("opt", ["all", "minimal", "different"])
@pytest.mark.parametrize("join", ["outer", "inner", "left", "right"])
def test_open_mfdataset_does_same_as_concat(self, combine, opt, join):
def test_open_mfdataset_does_same_as_concat(self, opt, join):
with self.setup_files_and_datasets() as (files, [ds1, ds2]):
if combine == "by_coords":
files.reverse()
with open_mfdataset(
files, data_vars=opt, combine=combine, concat_dim="t", join=join
files, data_vars=opt, combine="nested", concat_dim="t", join=join
dcherian marked this conversation as resolved.
Show resolved Hide resolved
) as ds:
ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join)
assert_identical(ds, ds_expect)
Expand Down Expand Up @@ -3066,14 +3063,14 @@ def test_open_mfdataset_dataset_combine_attrs(
with pytest.raises(xr.MergeError):
xr.open_mfdataset(
files,
combine="by_coords",
combine="nested",
concat_dim="t",
combine_attrs=combine_attrs,
)
else:
with xr.open_mfdataset(
files,
combine="by_coords",
combine="nested",
concat_dim="t",
combine_attrs=combine_attrs,
) as ds:
Expand All @@ -3091,7 +3088,7 @@ def test_open_mfdataset_dataset_attr_by_coords(self):
ds.close()
ds.to_netcdf(f)

with xr.open_mfdataset(files, combine="by_coords", concat_dim="t") as ds:
with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds:
assert ds.test_dataset_attr == 10

def test_open_mfdataset_dataarray_attr_by_coords(self):
Expand All @@ -3106,18 +3103,15 @@ def test_open_mfdataset_dataarray_attr_by_coords(self):
ds.close()
ds.to_netcdf(f)

with xr.open_mfdataset(files, combine="by_coords", concat_dim="t") as ds:
with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds:
assert ds["v1"].test_dataarray_attr == 0

@pytest.mark.parametrize("combine", ["nested", "by_coords"])
@pytest.mark.parametrize("opt", ["all", "minimal", "different"])
def test_open_mfdataset_exact_join_raises_error(self, combine, opt):
def test_open_mfdataset_exact_join_raises_error(self, opt):
with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]):
if combine == "by_coords":
files.reverse()
with pytest.raises(ValueError, match=r"indexes along dimension"):
open_mfdataset(
files, data_vars=opt, combine=combine, concat_dim="t", join="exact"
files, data_vars=opt, combine="nested", concat_dim="t", join="exact"
dcherian marked this conversation as resolved.
Show resolved Hide resolved
)

def test_common_coord_when_datavars_all(self):
Expand Down Expand Up @@ -3401,6 +3395,16 @@ def test_open_mfdataset_auto_combine(self):
with open_mfdataset([tmp2, tmp1], combine="by_coords") as actual:
assert_identical(original, actual)

def test_open_mfdataset_raise_on_bad_combine_args(self):
# Regression test for unhelpful error shown in #5230
original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)})
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
original.isel(x=slice(5)).to_netcdf(tmp1)
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
with pytest.raises(ValueError, match="`concat_dim` can only"):
open_mfdataset([tmp1, tmp2], concat_dim="x")

@pytest.mark.xfail(reason="mfdataset loses encoding currently.")
def test_encoding_mfdataset(self):
original = Dataset(
Expand Down