diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..232af2c45a2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,6 +100,8 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. +- Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but + not others (:issue:`508`). By `Scott Chamberlin `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5b4fc078236..82bfc0ea4d7 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,7 +1,9 @@ import pandas as pd +from collections import OrderedDict from . import dtypes, utils from .alignment import align +from .common import full_like from .duck_array_ops import lazy_array_equiv from .merge import _VALID_COMPAT, unique_variable from .variable import IndexVariable, Variable, as_variable @@ -26,7 +28,7 @@ def concat( xarray objects to concatenate together. Each object is expected to consist of variables and coordinates with matching shapes except for along the concatenated dimension. - dim : str or DataArray or pandas.Index + dim : str, DataArray, Variable, or pandas.Index Name of the dimension to concatenate along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the dimension is @@ -77,7 +79,8 @@ def concat( to assign each dataset along the concatenated dimension. If not supplied, objects are concatenated in the provided order. fill_value : scalar, optional - Value to use for newly missing values + Value to use for newly missing values as well as to fill values where the + variable is not present in all datasets. 
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes (excluding dim) in objects @@ -129,6 +132,7 @@ def concat( "can only concatenate xarray Dataset and DataArray " "objects, got %s" % type(first_obj) ) + return f(objs, dim, data_vars, coords, compat, positions, fill_value, join) @@ -261,21 +265,21 @@ def _parse_datasets(datasets): dims = set() all_coord_names = set() - data_vars = set() # list of data_vars + data_vars = {} # list of data_vars, using dict internally to maintain order dim_coords = {} # maps dim name to variable dims_sizes = {} # shared dimension sizes to expand variables for ds in datasets: dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) - data_vars.update(ds.data_vars) + data_vars.update(dict.fromkeys(ds.data_vars)) for dim in set(ds.dims) - dims: if dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, dims_sizes, all_coord_names, data_vars + return dim_coords, dims_sizes, all_coord_names, list(data_vars.keys()) def _dataset_concat( @@ -304,7 +308,7 @@ def _dataset_concat( dim_names = set(dim_coords) unlabeled_dims = dim_names - coord_names - both_data_and_coords = coord_names & data_names + both_data_and_coords = coord_names & set(data_names) if both_data_and_coords: raise ValueError( "%r is a coordinate in some datasets but not others." % both_data_and_coords @@ -323,7 +327,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - dim_names + variables_to_merge = (coord_names | set(data_names)) - concat_over - dim_names result_vars = {} if variables_to_merge: @@ -366,25 +370,81 @@ def ensure_common_dims(vars): var = var.set_dims(common_dims, common_shape) yield var - # stack up each variable to fill-out the dataset (in order) - # n.b. this loop preserves variable order, needed for groupby. 
- for k in datasets[0].variables: - if k in concat_over: - try: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - except KeyError: - raise ValueError("%r is not present in all datasets." % k) + # Find union of all data variables (preserving order) + # assumes all datasets are relatively in the same order + # and missing variables are inserted in the correct position + # if datasets have variables in drastically different orders + # the resulting order will be dependent on the order they are in the list + # passed to concat + data_var_order = list(datasets[0].data_vars) + data_var_order += [e for e in data_names if e not in data_var_order] + + union_of_variables = OrderedDict.fromkeys(data_var_order) + union_of_coordinates = OrderedDict.fromkeys(coord_names) + + # we don't want to fill coordinate variables so remove them + for k in union_of_coordinates.keys(): + union_of_variables.pop(k, None) + + # Cache a filled tmp variable with correct dims for filling missing variables + # doing this here allows us to concat with variables missing from any dataset + # this only runs until it finds one prototype for each variable in the concat list + # we will also only fill defaults for data_vars not coordinates + + # optimization to allow us to break when filling variable + def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): + for ds in datasets: + if union_of_variables[variable_key] is not None: + continue + + if variable_key not in ds.variables: + continue + + v_fill_value = fill_value + dtype, v_fill_value = dtypes.get_fill_value_for_variable( + ds[variable_key], fill_value + ) + + union_of_variables[variable_key] = full_like( + ds.variables[variable_key], fill_value=v_fill_value, dtype=dtype + ) + return + + for v in union_of_variables.keys(): + find_fill_variable_from_ds(v, union_of_variables, datasets) + + # create the concat list filling in missing variables + filling_coordinates = False + while len(union_of_variables) > 0 or 
len(union_of_coordinates) > 0: + k = None + # get the variables in order + if len(union_of_variables) > 0: + k = union_of_variables.popitem(last=False) + elif len(union_of_coordinates) > 0: + filling_coordinates = True + k = union_of_coordinates.popitem() + + if k[0] in concat_over: + variables = [] + for ds in datasets: + if k[0] in ds.variables: + variables.append(ds.variables[k[0]]) + else: + if filling_coordinates: + # in this case the coordinate is missing from a dataset + raise ValueError( + "Variables %r are coordinates in some datasets but not others." + % k[0] + ) + # var is missing, fill with cached value + variables.append(k[1]) + + vars = ensure_common_dims(variables) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) - result_vars[k] = combined + result_vars[k[0]] = combined result = Dataset(result_vars, attrs=result_attrs) - absent_coord_names = coord_names - set(result.variables) - if absent_coord_names: - raise ValueError( - "Variables %r are coordinates in some datasets but not others." - % absent_coord_names - ) result = result.set_coords(coord_names) result.encoding = result_encoding diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 4db2990accc..f52ff73a6b6 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -4,6 +4,7 @@ from . import utils + # Use as a sentinel value to indicate a dtype appropriate NA value. NA = utils.ReprObject("") @@ -96,6 +97,37 @@ def get_fill_value(dtype): return fill_value +def get_fill_value_for_variable(variable, fill_value=NA): + """Return an appropriate fill value for this variable + + Parameters + ---------- + variable : Dataset or DataArray + fill_value : a suggested fill value to evaluate and promote if necessary + + Returns + ------- + dtype : Promoted dtype for fill value. + new_fill_value : Missing value corresponding to this dtype. 
+ """ + from .dataset import Dataset + from .dataarray import DataArray + + if not (isinstance(variable, DataArray) or isinstance(variable, Dataset)): + raise TypeError( + "can only get fill value for xarray Dataset and DataArray " + "objects, got %s" % type(variable) + ) + + new_fill_value = fill_value + if fill_value is NA: + dtype, new_fill_value = maybe_promote(variable.dtype) + else: + dtype = variable.dtype + + return dtype, new_fill_value + + def get_pos_infinity(dtype): """Return an appropriate positive infinity for this dtype. diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cd26e7fb60b..009a50eb97b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -755,7 +755,9 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with raises_regex(ValueError, "'y' is not present in all datasets"): + with raises_regex( + ValueError, ".* are coordinates in some datasets but not others" + ): auto_combine(objs) def test_auto_combine_previously_failed(self): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0661ebb7a38..f52c729a13e 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +import random from xarray import DataArray, Dataset, Variable, concat from xarray.core import dtypes, merge @@ -18,6 +19,86 @@ from .test_dataset import create_test_data +# helper method to create multiple tests datasets to concat +def create_concat_datasets(num_datasets=2, seed=None, include_day=True): + random.seed(seed) + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + if include_day: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 
2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + else: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y"], np.random.randn(1, 4)), + "pressure": (["x", "y"], np.random.randn(1, 4)), + "humidity": (["x", "y"], np.random.randn(1, 4)), + "precipitation": (["x", "y"], np.random.randn(1, 4)), + "cloud cover": (["x", "y"], np.random.randn(1, 4)), + }, + coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, + ) + ) + + return result + + +# helper method to create multiple tests datasets to concat with specific types +def create_typed_datasets(num_datasets=2, seed=None): + random.seed(seed) + var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "string": ( + ["x", "y", "day"], + np.random.choice(var_strings, (1, 4, 2)), + ), + "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "datetime64": ( + ["x", "y", "day"], + np.arange( + np.datetime64("2017-01-01"), np.datetime64("2017-01-09") + ).reshape(1, 4, 2), + ), + "timedelta64": ( + ["x", "y", "day"], + np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]), + ), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + def test_concat_compat(): ds1 = Dataset( { @@ -44,10 +125,519 @@ def test_concat_compat(): with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") - with raises_regex(ValueError, "'q' is not present in all 
datasets"): + + with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds2, ds1], dim="q") +def test_concat_missing_var(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["humidity", "precipitation", "cloud cover"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_concat_missing_muliple_consecutive_var(): + datasets = create_concat_datasets(3, 123) + vars_to_drop = ["pressure", "humidity"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + ( + datasets[0].temperature.values, + datasets[1].temperature.values, + datasets[2].temperature.values, + ), + axis=2, + ) + pressure_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].pressure.values, + ), + axis=2, + ) + humidity_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].humidity.values, + ), + axis=2, + ) + precipitation_result = np.concatenate( + ( + datasets[0].precipitation.values, + datasets[1].precipitation.values, + datasets[2].precipitation.values, + ), 
+ axis=2, + ) + cloudcover_result = np.concatenate( + ( + datasets[0]["cloud cover"].values, + datasets[1]["cloud cover"].values, + datasets[2]["cloud cover"].values, + ), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4", "day5", "day6"], + }, + ) + result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) + + +def test_concat_all_empty(): + ds1 = Dataset() + ds2 = Dataset() + result = concat([ds1, ds2], dim="new_dim") + + assert_equal(result, Dataset()) + + +def test_concat_second_empty(): + ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + result = concat([ds1, ds2], dim="y") + + assert_equal(result, ds_result) + + +def test_multiple_missing_variables(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["pressure", "cloud cover"] + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + 
cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +@pytest.mark.xfail(strict=True) +def test_concat_multiple_datasets_missing_vars_and_new_dim(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=1, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) + else: + 
result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1, + ) + # TODO: this test still has two unexpected errors: + + # 1: concat throws a MergeError expecting the temperature values to be the same, this doesn't seem to be correct in this case + # as we are concatenating on new dims + # 2: if the values are the same for a variable (working around #1) then it will likely not correctly add the new dim to the first variable + # in the resulting set + + # ds_result = Dataset( + # data_vars={ + # # pressure will be first in this since the first dataset is missing this var + # # and there isn't a good way to determine that this should be first + # #this also means temperature will be last as the first data vars will + # #determine the order for all that exist in that dataset + # "pressure": (["x", "y", "day"], result_vars["pressure"]), + # "humidity": (["x", "y", "day"], result_vars["humidity"]), + # "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + # "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + # "temperature": (["x", "y", "day"], result_vars["temperature"]), + # }, + # coords={ + # "lat": (["x", "y"], datasets[0].lat.values), + # "lon": (["x", "y"], datasets[0].lon.values), + # # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + # }, + # ) + + # result = concat(datasets, dim="day") + # r1 = list(result.data_vars.keys()) + # r2 = list(ds_result.data_vars.keys()) + # assert r1 == r2 # check the variables orders are the same + + # assert_equal(result, ds_result) + + +def test_multiple_datasets_with_missing_variables(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per 
dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundaries cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=2, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_multiple_missing_variables(): + vars_to_drop_in_first = ["temperature", 
"pressure"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + # these two are at the end of the expected as they are missing from the first + # dataset in the concat list + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + + assert_equal(result, ds_result) + + +def test_type_of_missing_fill(): + datasets = create_typed_datasets(2, 123) + + vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"] + + # set up the test data + datasets[1] = datasets[1].drop_vars(vars[1:]) + + float_result = np.concatenate( + (datasets[0].float.values, datasets[1].float.values), axis=2 + ) + float2_result = 
np.concatenate( + (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + # to correctly create the expected dataset we need to ensure we promote the string array to + # object type before filling as it will be promoted to that in the concat case. + # this matches the behavior of pandas + string_values = datasets[0].string.values + string_values = string_values.astype(object) + string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) + datetime_result = np.concatenate( + (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), + axis=2, + ) + timedelta_result = np.concatenate( + (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), + axis=2, + ) + # string_result = string_result.astype(object) + int_result = np.concatenate( + (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result), + "float2": (["x", "y", "day"], float2_result), + "string": (["x", "y", "day"], string_result), + "int": (["x", "y", "day"], int_result), + "datetime64": (["x", "y", "day"], datetime_result), + "timedelta64": (["x", "y", "day"], timedelta_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day", fill_value=dtypes.NA) + + assert_equal(result, ds_result) + + # test in the reverse order + float_result_rev = np.concatenate( + (datasets[1].float.values, datasets[0].float.values), axis=2 + ) + float2_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 + ) + string_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), string_values), axis=2 + ) + datetime_result_rev = np.concatenate( + (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), + axis=2, + ) + timedelta_result_rev = np.concatenate( + (np.full([1, 
4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), + axis=2, + ) + int_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 + ) + ds_result_rev = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result_rev), + "float2": (["x", "y", "day"], float2_result_rev), + "string": (["x", "y", "day"], string_result_rev), + "int": (["x", "y", "day"], int_result_rev), + "datetime64": (["x", "y", "day"], datetime_result_rev), + "timedelta64": (["x", "y", "day"], timedelta_result_rev), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day3", "day4", "day1", "day2"], + }, + ) + result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + + assert_equal(result_rev, ds_result_rev) + + +def test_order_when_filling_missing(): + vars_to_drop_in_first = [] + # drop middle + vars_to_drop_in_second = ["humidity"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + 
"humidity": (["x", "y", "day"], humidity_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + result_keys = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + result_index = 0 + for k in result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + result_keys_rev = [ + "temperature", + "pressure", + "precipitation", + "cloud cover", + "humidity", + ] + # test order when concat in reversed order + rev_result = concat(datasets[::-1], dim="day") + result_index = 0 + for k in rev_result.data_vars.keys(): + assert k == result_keys_rev[result_index] + result_index += 1 + + class TestConcatDataset: @pytest.fixture def data(self): @@ -321,23 +911,36 @@ def test_concat_multiindex(self): assert expected.equals(actual) assert isinstance(actual.x.to_index(), pd.MultiIndex) + # TODO add parameter for missing var @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value): datasets = [ Dataset({"a": ("x", [2, 3]), "x": [1, 2]}), Dataset({"a": ("x", [1, 2]), "x": [0, 1]}), ] + if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array - fill_value = np.nan + fill_value_expected = np.nan + else: + fill_value_expected = fill_value + expected = Dataset( - {"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])}, + { + "a": ( + ("t", "x"), + [[fill_value_expected, 2, 3], [1, 2, fill_value_expected]], + ) + }, {"x": [0, 1, 2]}, ) actual = concat(datasets, dim="t", fill_value=fill_value) assert_identical(actual, expected) + # check that the dtype is as expected + assert expected.a.dtype == type(fill_value_expected) + class TestConcatDataArray: def test_concat(self): @@ -371,6 +974,8 @@ def test_concat(self): expected = 
foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual) + # TODO: is it really correct to expect the new dim to be concat_dim in this case? + # I propose it's likely better to throw an exception actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual)