diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index abd94779435..232af2c45a2 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -100,6 +100,8 @@ Bug fixes
(:issue:`3402`). By `Deepak Cherian <https://github.com/dcherian>`_
- Allow appending datetime and bool data variables to zarr stores.
(:issue:`3480`). By `Akihiro Matsukawa <https://github.com/amatsukawa>`_.
+- Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but
+  not others (:issue:`508`). By `Scott Chamberlin <https://github.com/scottcha>`_.
Documentation
~~~~~~~~~~~~~
diff --git a/xarray/core/concat.py b/xarray/core/concat.py
index 5b4fc078236..82bfc0ea4d7 100644
--- a/xarray/core/concat.py
+++ b/xarray/core/concat.py
@@ -1,7 +1,9 @@
import pandas as pd
+from collections import OrderedDict
from . import dtypes, utils
from .alignment import align
+from .common import full_like
from .duck_array_ops import lazy_array_equiv
from .merge import _VALID_COMPAT, unique_variable
from .variable import IndexVariable, Variable, as_variable
@@ -26,7 +28,7 @@ def concat(
xarray objects to concatenate together. Each object is expected to
consist of variables and coordinates with matching shapes except for
along the concatenated dimension.
- dim : str or DataArray or pandas.Index
+ dim : str, DataArray, Variable, or pandas.Index
Name of the dimension to concatenate along. This can either be a new
dimension name, in which case it is added along axis=0, or an existing
dimension name, in which case the location of the dimension is
@@ -77,7 +79,8 @@ def concat(
to assign each dataset along the concatenated dimension. If not
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
- Value to use for newly missing values
+        Value to use for newly missing values, as well as to fill values for
+        variables that are not present in all datasets.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding dim) in objects
@@ -129,6 +132,7 @@ def concat(
"can only concatenate xarray Dataset and DataArray "
"objects, got %s" % type(first_obj)
)
+
return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)
@@ -261,21 +265,21 @@ def _parse_datasets(datasets):
dims = set()
all_coord_names = set()
- data_vars = set() # list of data_vars
+    data_vars = {}  # dict of data_vars; a dict is used to preserve insertion order
dim_coords = {} # maps dim name to variable
dims_sizes = {} # shared dimension sizes to expand variables
for ds in datasets:
dims_sizes.update(ds.dims)
all_coord_names.update(ds.coords)
- data_vars.update(ds.data_vars)
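+        # dict.fromkeys gives set-style update semantics while keeping
+        # first-seen order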
+ data_vars.update(dict.fromkeys(ds.data_vars))
for dim in set(ds.dims) - dims:
if dim not in dim_coords:
dim_coords[dim] = ds.coords[dim].variable
dims = dims | set(ds.dims)
- return dim_coords, dims_sizes, all_coord_names, data_vars
+ return dim_coords, dims_sizes, all_coord_names, list(data_vars.keys())
def _dataset_concat(
@@ -304,7 +308,7 @@ def _dataset_concat(
dim_names = set(dim_coords)
unlabeled_dims = dim_names - coord_names
- both_data_and_coords = coord_names & data_names
+ both_data_and_coords = coord_names & set(data_names)
if both_data_and_coords:
raise ValueError(
"%r is a coordinate in some datasets but not others." % both_data_and_coords
@@ -323,7 +327,7 @@ def _dataset_concat(
)
# determine which variables to merge, and then merge them according to compat
- variables_to_merge = (coord_names | data_names) - concat_over - dim_names
+ variables_to_merge = (coord_names | set(data_names)) - concat_over - dim_names
result_vars = {}
if variables_to_merge:
@@ -366,25 +370,81 @@ def ensure_common_dims(vars):
var = var.set_dims(common_dims, common_shape)
yield var
- # stack up each variable to fill-out the dataset (in order)
- # n.b. this loop preserves variable order, needed for groupby.
- for k in datasets[0].variables:
- if k in concat_over:
- try:
- vars = ensure_common_dims([ds.variables[k] for ds in datasets])
- except KeyError:
- raise ValueError("%r is not present in all datasets." % k)
+    # Find the union of all data variables (preserving order).
+    # This assumes the datasets keep their variables in roughly the same
+    # order, so missing variables can be inserted in sensible positions.
+    # If datasets order their variables in drastically different ways, the
+    # resulting order depends on the order of the datasets in the list
+    # passed to concat.
+ data_var_order = list(datasets[0].data_vars)
+ data_var_order += [e for e in data_names if e not in data_var_order]
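+    # e.g. datasets with data_vars ["a", "b"] and ["b", "c"] produce the
+    # order ["a", "b", "c"]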
+
+ union_of_variables = OrderedDict.fromkeys(data_var_order)
+ union_of_coordinates = OrderedDict.fromkeys(coord_names)
+
+ # we don't want to fill coordinate variables so remove them
+ for k in union_of_coordinates.keys():
+ union_of_variables.pop(k, None)
+
+    # Cache a filled template variable with the correct dims for filling
+    # missing variables; doing this here lets us concat even when a variable
+    # is missing from some datasets. The search stops once one prototype has
+    # been found for each variable in the concat list, and defaults are only
+    # filled for data_vars, not coordinates.
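+    # e.g. a variable that is int64 where present gets a template promoted to
+    # float64 and filled with NaN (see dtypes.get_fill_value_for_variable)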
+
+    # helper so the dataset scan can return early once a fill template is found
+ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets):
+ for ds in datasets:
+ if union_of_variables[variable_key] is not None:
+ continue
+
+ if variable_key not in ds.variables:
+ continue
+
+            dtype, v_fill_value = dtypes.get_fill_value_for_variable(
+                ds[variable_key], fill_value
+            )
+
+ union_of_variables[variable_key] = full_like(
+ ds.variables[variable_key], fill_value=v_fill_value, dtype=dtype
+ )
+ return
+
+ for v in union_of_variables.keys():
+ find_fill_variable_from_ds(v, union_of_variables, datasets)
+
+    # create the concat list, filling in missing variables
+    filling_coordinates = False
+    while len(union_of_variables) > 0 or len(union_of_coordinates) > 0:
+        # take the data variables in order, then the coordinates
+        if len(union_of_variables) > 0:
+            name, fill_template = union_of_variables.popitem(last=False)
+        else:
+            filling_coordinates = True
+            name, fill_template = union_of_coordinates.popitem()
+
+        if name in concat_over:
+            variables = []
+            for ds in datasets:
+                if name in ds.variables:
+                    variables.append(ds.variables[name])
+                else:
+                    if filling_coordinates:
+                        # a coordinate is missing from one of the datasets
+                        raise ValueError(
+                            "Variables %r are coordinates in some datasets but not others."
+                            % name
+                        )
+                    # var is missing, fill with the cached template
+                    variables.append(fill_template)
+
+            vars = ensure_common_dims(variables)
combined = concat_vars(vars, dim, positions)
assert isinstance(combined, Variable)
- result_vars[k] = combined
+            result_vars[name] = combined
result = Dataset(result_vars, attrs=result_attrs)
- absent_coord_names = coord_names - set(result.variables)
- if absent_coord_names:
- raise ValueError(
- "Variables %r are coordinates in some datasets but not others."
- % absent_coord_names
- )
result = result.set_coords(coord_names)
result.encoding = result_encoding
diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py
index 4db2990accc..f52ff73a6b6 100644
--- a/xarray/core/dtypes.py
+++ b/xarray/core/dtypes.py
@@ -4,6 +4,7 @@
from . import utils
+
# Use as a sentinel value to indicate a dtype appropriate NA value.
NA = utils.ReprObject("<NA>")
@@ -96,6 +97,37 @@ def get_fill_value(dtype):
return fill_value
+def get_fill_value_for_variable(variable, fill_value=NA):
+ """Return an appropriate fill value for this variable
+
+ Parameters
+ ----------
+    variable : Dataset or DataArray
+    fill_value : scalar, optional
+        A suggested fill value to evaluate and promote if necessary.
+
+ Returns
+ -------
+ dtype : Promoted dtype for fill value.
+ new_fill_value : Missing value corresponding to this dtype.
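+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from xarray import DataArray
+    >>> # integer dtypes are promoted so NaN can represent missing values
+    >>> get_fill_value_for_variable(DataArray(np.array([1, 2], dtype="int64")))
+    (dtype('float64'), nan)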
+ """
+ from .dataset import Dataset
+ from .dataarray import DataArray
+
+    if not isinstance(variable, (DataArray, Dataset)):
+ raise TypeError(
+ "can only get fill value for xarray Dataset and DataArray "
+ "objects, got %s" % type(variable)
+ )
+
+ new_fill_value = fill_value
+ if fill_value is NA:
+ dtype, new_fill_value = maybe_promote(variable.dtype)
+ else:
+ dtype = variable.dtype
+
+ return dtype, new_fill_value
+
+
def get_pos_infinity(dtype):
"""Return an appropriate positive infinity for this dtype.
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index cd26e7fb60b..009a50eb97b 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -755,7 +755,9 @@ def test_auto_combine(self):
auto_combine(objs)
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
- with raises_regex(ValueError, "'y' is not present in all datasets"):
+ with raises_regex(
+ ValueError, ".* are coordinates in some datasets but not others"
+ ):
auto_combine(objs)
def test_auto_combine_previously_failed(self):
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py
index 0661ebb7a38..f52c729a13e 100644
--- a/xarray/tests/test_concat.py
+++ b/xarray/tests/test_concat.py
@@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
import pytest
from xarray import DataArray, Dataset, Variable, concat
from xarray.core import dtypes, merge
@@ -18,6 +19,86 @@
from .test_dataset import create_test_data
+# helper method to create multiple test datasets to concat
+def create_concat_datasets(num_datasets=2, seed=None, include_day=True):
+    np.random.seed(seed)
+ result = []
+ lat = np.random.randn(1, 4)
+ lon = np.random.randn(1, 4)
+ for i in range(num_datasets):
+ if include_day:
+ result.append(
+ Dataset(
+ data_vars={
+ "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ },
+ coords={
+ "lat": (["x", "y"], lat),
+ "lon": (["x", "y"], lon),
+ "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
+ },
+ )
+ )
+ else:
+ result.append(
+ Dataset(
+ data_vars={
+ "temperature": (["x", "y"], np.random.randn(1, 4)),
+ "pressure": (["x", "y"], np.random.randn(1, 4)),
+ "humidity": (["x", "y"], np.random.randn(1, 4)),
+ "precipitation": (["x", "y"], np.random.randn(1, 4)),
+ "cloud cover": (["x", "y"], np.random.randn(1, 4)),
+ },
+ coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)},
+ )
+ )
+
+ return result
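+
+# e.g. create_concat_datasets(2, 123) returns two datasets, each with the data
+# variables temperature/pressure/humidity/precipitation/"cloud cover" of shape
+# (x=1, y=4, day=2) and "day" coords ["day1", "day2"] and ["day3", "day4"]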
+
+
+# helper method to create multiple test datasets to concat with specific types
+def create_typed_datasets(num_datasets=2, seed=None):
+    np.random.seed(seed)
+ var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"]
+ result = []
+ lat = np.random.randn(1, 4)
+ lon = np.random.randn(1, 4)
+ for i in range(num_datasets):
+ result.append(
+ Dataset(
+ data_vars={
+ "float": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)),
+ "string": (
+ ["x", "y", "day"],
+ np.random.choice(var_strings, (1, 4, 2)),
+ ),
+ "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))),
+ "datetime64": (
+ ["x", "y", "day"],
+ np.arange(
+ np.datetime64("2017-01-01"), np.datetime64("2017-01-09")
+ ).reshape(1, 4, 2),
+ ),
+ "timedelta64": (
+ ["x", "y", "day"],
+ np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]),
+ ),
+ },
+ coords={
+ "lat": (["x", "y"], lat),
+ "lon": (["x", "y"], lon),
+ "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
+ },
+ )
+ )
+ return result
+
+
def test_concat_compat():
ds1 = Dataset(
{
@@ -44,10 +125,519 @@ def test_concat_compat():
with raises_regex(ValueError, "coordinates in some datasets but not others"):
concat([ds1, ds2], dim="q")
- with raises_regex(ValueError, "'q' is not present in all datasets"):
+
+ with raises_regex(ValueError, "coordinates in some datasets but not others"):
concat([ds2, ds1], dim="q")
+def test_concat_missing_var():
+ datasets = create_concat_datasets(2, 123)
+ vars_to_drop = ["humidity", "precipitation", "cloud cover"]
+ datasets[0] = datasets[0].drop_vars(vars_to_drop)
+ datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"])
+
+ temperature_result = np.concatenate(
+ (datasets[0].temperature.values, datasets[1].temperature.values), axis=2
+ )
+ pressure_result = np.concatenate(
+ (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ ds_result = Dataset(
+ data_vars={
+ "temperature": (["x", "y", "day"], temperature_result),
+ "pressure": (["x", "y", "day"], pressure_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4"],
+ },
+ )
+ result = concat(datasets, dim="day")
+
+    # check that the variable order is the same
+    assert list(result.data_vars.keys()) == list(ds_result.data_vars.keys())
+
+ assert_equal(result, ds_result)
+
+
+def test_concat_missing_multiple_consecutive_var():
+ datasets = create_concat_datasets(3, 123)
+ vars_to_drop = ["pressure", "humidity"]
+ datasets[0] = datasets[0].drop_vars(vars_to_drop)
+ datasets[1] = datasets[1].drop_vars(vars_to_drop)
+
+ temperature_result = np.concatenate(
+ (
+ datasets[0].temperature.values,
+ datasets[1].temperature.values,
+ datasets[2].temperature.values,
+ ),
+ axis=2,
+ )
+ pressure_result = np.concatenate(
+ (
+ np.full([1, 4, 2], np.nan),
+ np.full([1, 4, 2], np.nan),
+ datasets[2].pressure.values,
+ ),
+ axis=2,
+ )
+ humidity_result = np.concatenate(
+ (
+ np.full([1, 4, 2], np.nan),
+ np.full([1, 4, 2], np.nan),
+ datasets[2].humidity.values,
+ ),
+ axis=2,
+ )
+ precipitation_result = np.concatenate(
+ (
+ datasets[0].precipitation.values,
+ datasets[1].precipitation.values,
+ datasets[2].precipitation.values,
+ ),
+ axis=2,
+ )
+ cloudcover_result = np.concatenate(
+ (
+ datasets[0]["cloud cover"].values,
+ datasets[1]["cloud cover"].values,
+ datasets[2]["cloud cover"].values,
+ ),
+ axis=2,
+ )
+
+ ds_result = Dataset(
+ data_vars={
+ "temperature": (["x", "y", "day"], temperature_result),
+ "precipitation": (["x", "y", "day"], precipitation_result),
+ "cloud cover": (["x", "y", "day"], cloudcover_result),
+ "pressure": (["x", "y", "day"], pressure_result),
+ "humidity": (["x", "y", "day"], humidity_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4", "day5", "day6"],
+ },
+ )
+ result = concat(datasets, dim="day")
+    # check that the variable order is the same
+    assert list(result.data_vars.keys()) == list(ds_result.data_vars.keys())
+ assert_equal(result, ds_result)
+
+
+def test_concat_all_empty():
+ ds1 = Dataset()
+ ds2 = Dataset()
+ result = concat([ds1, ds2], dim="new_dim")
+
+ assert_equal(result, Dataset())
+
+
+def test_concat_second_empty():
+ ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1})
+ ds2 = Dataset(coords={"x": 0.1})
+
+ ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1})
+ result = concat([ds1, ds2], dim="y")
+
+ assert_equal(result, ds_result)
+
+
+def test_multiple_missing_variables():
+ datasets = create_concat_datasets(2, 123)
+ vars_to_drop = ["pressure", "cloud cover"]
+ datasets[1] = datasets[1].drop_vars(vars_to_drop)
+
+ temperature_result = np.concatenate(
+ (datasets[0].temperature.values, datasets[1].temperature.values), axis=2
+ )
+ pressure_result = np.concatenate(
+ (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ humidity_result = np.concatenate(
+ (datasets[0].humidity.values, datasets[1].humidity.values), axis=2
+ )
+ precipitation_result = np.concatenate(
+ (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2
+ )
+ cloudcover_result = np.concatenate(
+ (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ ds_result = Dataset(
+ data_vars={
+ "temperature": (["x", "y", "day"], temperature_result),
+ "pressure": (["x", "y", "day"], pressure_result),
+ "humidity": (["x", "y", "day"], humidity_result),
+ "precipitation": (["x", "y", "day"], precipitation_result),
+ "cloud cover": (["x", "y", "day"], cloudcover_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4"],
+ },
+ )
+ result = concat(datasets, dim="day")
+
+    # check that the variable order is the same
+    assert list(result.data_vars.keys()) == list(ds_result.data_vars.keys())
+
+ assert_equal(result, ds_result)
+
+
+@pytest.mark.xfail(strict=True)
+def test_concat_multiple_datasets_missing_vars_and_new_dim():
+ vars_to_drop = [
+ "temperature",
+ "pressure",
+ "humidity",
+ "precipitation",
+ "cloud cover",
+ ]
+ datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False)
+ # set up the test data
+ datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))]
+
+ # set up the validation data
+    # the below code drops one var per dataset depending on the position of the
+    # dataset in the list, which lets us quickly catch any boundary cases across
+    # the three equivalence classes: beginning, middle and end of the concat list
+ result_vars = dict.fromkeys(vars_to_drop)
+ for i in range(len(vars_to_drop)):
+ for d in range(len(datasets)):
+ if d != i:
+ if result_vars[vars_to_drop[i]] is None:
+ result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values
+ else:
+ result_vars[vars_to_drop[i]] = np.concatenate(
+ (
+ result_vars[vars_to_drop[i]],
+ datasets[d][vars_to_drop[i]].values,
+ ),
+ axis=1,
+ )
+ else:
+ if result_vars[vars_to_drop[i]] is None:
+ result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan)
+ else:
+ result_vars[vars_to_drop[i]] = np.concatenate(
+ (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1,
+ )
+    # TODO: this test still hits two unexpected errors:
+
+    # 1: concat raises a MergeError expecting the temperature values to be the
+    # same; this doesn't seem correct here since we are concatenating on new dims
+    # 2: if the values are the same for a variable (working around #1), the new
+    # dim is likely not added correctly to the first variable in the resulting set
+
+ # ds_result = Dataset(
+ # data_vars={
+ # # pressure will be first in this since the first dataset is missing this var
+ # # and there isn't a good way to determine that this should be first
+ # #this also means temperature will be last as the first data vars will
+ # #determine the order for all that exist in that dataset
+ # "pressure": (["x", "y", "day"], result_vars["pressure"]),
+ # "humidity": (["x", "y", "day"], result_vars["humidity"]),
+ # "precipitation": (["x", "y", "day"], result_vars["precipitation"]),
+ # "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]),
+ # "temperature": (["x", "y", "day"], result_vars["temperature"]),
+ # },
+ # coords={
+ # "lat": (["x", "y"], datasets[0].lat.values),
+ # "lon": (["x", "y"], datasets[0].lon.values),
+ # # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))],
+ # },
+ # )
+
+ # result = concat(datasets, dim="day")
+ # r1 = list(result.data_vars.keys())
+ # r2 = list(ds_result.data_vars.keys())
+ # assert r1 == r2 # check the variables orders are the same
+
+ # assert_equal(result, ds_result)
+
+
+def test_multiple_datasets_with_missing_variables():
+ vars_to_drop = [
+ "temperature",
+ "pressure",
+ "humidity",
+ "precipitation",
+ "cloud cover",
+ ]
+ datasets = create_concat_datasets(len(vars_to_drop), 123)
+ # set up the test data
+ datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))]
+
+ # set up the validation data
+    # the below code drops one var per dataset depending on the position of the
+    # dataset in the list, which lets us quickly catch any boundary cases across
+    # the three equivalence classes: beginning, middle and end of the concat list
+ result_vars = dict.fromkeys(vars_to_drop)
+ for i in range(len(vars_to_drop)):
+ for d in range(len(datasets)):
+ if d != i:
+ if result_vars[vars_to_drop[i]] is None:
+ result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values
+ else:
+ result_vars[vars_to_drop[i]] = np.concatenate(
+ (
+ result_vars[vars_to_drop[i]],
+ datasets[d][vars_to_drop[i]].values,
+ ),
+ axis=2,
+ )
+ else:
+ if result_vars[vars_to_drop[i]] is None:
+ result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan)
+ else:
+ result_vars[vars_to_drop[i]] = np.concatenate(
+ (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)),
+ axis=2,
+ )
+
+ ds_result = Dataset(
+ data_vars={
+ # pressure will be first in this since the first dataset is missing this var
+ # and there isn't a good way to determine that this should be first
+ # this also means temperature will be last as the first data vars will
+ # determine the order for all that exist in that dataset
+ "pressure": (["x", "y", "day"], result_vars["pressure"]),
+ "humidity": (["x", "y", "day"], result_vars["humidity"]),
+ "precipitation": (["x", "y", "day"], result_vars["precipitation"]),
+ "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]),
+ "temperature": (["x", "y", "day"], result_vars["temperature"]),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))],
+ },
+ )
+ result = concat(datasets, dim="day")
+
+    # check that the variable order is the same
+    assert list(result.data_vars.keys()) == list(ds_result.data_vars.keys())
+
+ assert_equal(result, ds_result)
+
+
+def test_multiple_datasets_with_multiple_missing_variables():
+ vars_to_drop_in_first = ["temperature", "pressure"]
+ vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"]
+ datasets = create_concat_datasets(2, 123)
+ # set up the test data
+ datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
+ datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)
+
+ temperature_result = np.concatenate(
+ (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2
+ )
+ pressure_result = np.concatenate(
+ (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2
+ )
+ humidity_result = np.concatenate(
+ (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ precipitation_result = np.concatenate(
+ (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ cloudcover_result = np.concatenate(
+ (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ ds_result = Dataset(
+ data_vars={
+ "humidity": (["x", "y", "day"], humidity_result),
+ "precipitation": (["x", "y", "day"], precipitation_result),
+ "cloud cover": (["x", "y", "day"], cloudcover_result),
+ # these two are at the end of the expected as they are missing from the first
+ # dataset in the concat list
+ "temperature": (["x", "y", "day"], temperature_result),
+ "pressure": (["x", "y", "day"], pressure_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4"],
+ },
+ )
+ result = concat(datasets, dim="day")
+
+    # check that the variable order is the same
+    assert list(result.data_vars.keys()) == list(ds_result.data_vars.keys())
+
+ assert_equal(result, ds_result)
+
+
+def test_type_of_missing_fill():
+ datasets = create_typed_datasets(2, 123)
+
+ vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"]
+
+ # set up the test data
+ datasets[1] = datasets[1].drop_vars(vars[1:])
+
+ float_result = np.concatenate(
+ (datasets[0].float.values, datasets[1].float.values), axis=2
+ )
+ float2_result = np.concatenate(
+ (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+    # to correctly create the expected dataset we need to promote the string
+    # array to object dtype before filling, since concat promotes it the same
+    # way; this matches the behavior of pandas
+ string_values = datasets[0].string.values
+ string_values = string_values.astype(object)
+ string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2)
+ datetime_result = np.concatenate(
+ (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))),
+ axis=2,
+ )
+ timedelta_result = np.concatenate(
+ (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))),
+ axis=2,
+ )
+ int_result = np.concatenate(
+ (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ ds_result = Dataset(
+ data_vars={
+ "float": (["x", "y", "day"], float_result),
+ "float2": (["x", "y", "day"], float2_result),
+ "string": (["x", "y", "day"], string_result),
+ "int": (["x", "y", "day"], int_result),
+ "datetime64": (["x", "y", "day"], datetime_result),
+ "timedelta64": (["x", "y", "day"], timedelta_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4"],
+ },
+ )
+ result = concat(datasets, dim="day", fill_value=dtypes.NA)
+
+ assert_equal(result, ds_result)
+
+ # test in the reverse order
+ float_result_rev = np.concatenate(
+ (datasets[1].float.values, datasets[0].float.values), axis=2
+ )
+ float2_result_rev = np.concatenate(
+ (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2
+ )
+ string_result_rev = np.concatenate(
+ (np.full([1, 4, 2], np.nan), string_values), axis=2
+ )
+ datetime_result_rev = np.concatenate(
+ (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values),
+ axis=2,
+ )
+ timedelta_result_rev = np.concatenate(
+ (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values),
+ axis=2,
+ )
+ int_result_rev = np.concatenate(
+ (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2
+ )
+ ds_result_rev = Dataset(
+ data_vars={
+ "float": (["x", "y", "day"], float_result_rev),
+ "float2": (["x", "y", "day"], float2_result_rev),
+ "string": (["x", "y", "day"], string_result_rev),
+ "int": (["x", "y", "day"], int_result_rev),
+ "datetime64": (["x", "y", "day"], datetime_result_rev),
+ "timedelta64": (["x", "y", "day"], timedelta_result_rev),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day3", "day4", "day1", "day2"],
+ },
+ )
+ result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA)
+
+ assert_equal(result_rev, ds_result_rev)
+
+
+def test_order_when_filling_missing():
+ vars_to_drop_in_first = []
+ # drop middle
+ vars_to_drop_in_second = ["humidity"]
+ datasets = create_concat_datasets(2, 123)
+ # set up the test data
+ datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
+ datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)
+
+ temperature_result = np.concatenate(
+ (datasets[0].temperature.values, datasets[1].temperature.values), axis=2
+ )
+ pressure_result = np.concatenate(
+ (datasets[0].pressure.values, datasets[1].pressure.values), axis=2
+ )
+ humidity_result = np.concatenate(
+ (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2
+ )
+ precipitation_result = np.concatenate(
+ (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2
+ )
+ cloudcover_result = np.concatenate(
+ (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2
+ )
+ ds_result = Dataset(
+ data_vars={
+ "temperature": (["x", "y", "day"], temperature_result),
+ "pressure": (["x", "y", "day"], pressure_result),
+ "precipitation": (["x", "y", "day"], precipitation_result),
+ "cloud cover": (["x", "y", "day"], cloudcover_result),
+ "humidity": (["x", "y", "day"], humidity_result),
+ },
+ coords={
+ "lat": (["x", "y"], datasets[0].lat.values),
+ "lon": (["x", "y"], datasets[0].lon.values),
+ "day": ["day1", "day2", "day3", "day4"],
+ },
+ )
+ result = concat(datasets, dim="day")
+
+ assert_equal(result, ds_result)
+
+ result_keys = [
+ "temperature",
+ "pressure",
+ "humidity",
+ "precipitation",
+ "cloud cover",
+ ]
+    assert list(result.data_vars.keys()) == result_keys
+
+ result_keys_rev = [
+ "temperature",
+ "pressure",
+ "precipitation",
+ "cloud cover",
+ "humidity",
+ ]
+ # test order when concat in reversed order
+ rev_result = concat(datasets[::-1], dim="day")
+    assert list(rev_result.data_vars.keys()) == result_keys_rev
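+
+
+def test_concat_missing_var_scalar_fill_value():
+    # minimal sketch of the fill_value behavior documented in concat: a
+    # user-supplied scalar should be used verbatim for variables missing
+    # from some datasets (test name and data here are illustrative)
+    ds1 = Dataset({"a": ("y", [1.0, 2.0]), "b": ("y", [3.0, 4.0])})
+    ds2 = Dataset({"a": ("y", [5.0, 6.0])})
+
+    ds_result = Dataset(
+        {"a": ("y", [1.0, 2.0, 5.0, 6.0]), "b": ("y", [3.0, 4.0, 0.0, 0.0])}
+    )
+    result = concat([ds1, ds2], dim="y", fill_value=0.0)
+
+    assert_equal(result, ds_result)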
+
+
class TestConcatDataset:
@pytest.fixture
def data(self):
@@ -321,23 +911,36 @@ def test_concat_multiindex(self):
assert expected.equals(actual)
assert isinstance(actual.x.to_index(), pd.MultiIndex)
+ # TODO add parameter for missing var
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
def test_concat_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
]
+
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
- fill_value = np.nan
+ fill_value_expected = np.nan
+ else:
+ fill_value_expected = fill_value
+
expected = Dataset(
- {"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])},
+ {
+ "a": (
+ ("t", "x"),
+ [[fill_value_expected, 2, 3], [1, 2, fill_value_expected]],
+ )
+ },
{"x": [0, 1, 2]},
)
actual = concat(datasets, dim="t", fill_value=fill_value)
assert_identical(actual, expected)
+ # check that the dtype is as expected
+ assert expected.a.dtype == type(fill_value_expected)
+
class TestConcatDataArray:
def test_concat(self):
@@ -371,6 +974,8 @@ def test_concat(self):
expected = foo[:2].rename({"x": "concat_dim"})
assert_identical(expected, actual)
+        # TODO: is it really correct to expect the new dim to be concat_dim in
+        # this case? It is likely better to throw an exception instead.
actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True)
expected = foo[:2].rename({"x": "concat_dim"})
assert_identical(expected, actual)