From baeebeda6b911df6a420b33c7d328d42cb425923 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 17 Nov 2019 12:35:11 -0800 Subject: [PATCH 01/13] Concat adds default variables when missing from a dataset --- xarray/core/concat.py | 22 ++++++++++++++++++---- xarray/tests/test_combine.py | 10 ++++++---- xarray/tests/test_concat.py | 17 ++++++++++++++--- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5b4fc078236..abee1ef1095 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,7 +1,9 @@ import pandas as pd +import numpy as np from . import dtypes, utils from .alignment import align +from .common import full_like from .duck_array_ops import lazy_array_equiv from .merge import _VALID_COMPAT, unique_variable from .variable import IndexVariable, Variable, as_variable @@ -370,10 +372,22 @@ def ensure_common_dims(vars): # n.b. this loop preserves variable order, needed for groupby. for k in datasets[0].variables: if k in concat_over: - try: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - except KeyError: - raise ValueError("%r is not present in all datasets." % k) + variables = [] + for ds in datasets: + # if one of the variables doesn't exist find one which does + # and use it to create a fill value + if k not in ds.variables: + for ds in datasets: + if k in ds.variables: + # found one to use as a fill value, fill with np.nan + filled = full_like( + ds.variables[k], fill_value=np.nan, dtype=np.double + ) + break + variables.append(filled) + else: + variables.append(ds.variables[k]) + vars = ensure_common_dims(variables) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) result_vars[k] = combined diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cd26e7fb60b..47a92538474 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -742,10 +742,16 @@ def test_auto_combine(self): Dataset({"x": ("a", [0]), "y": ("a", [0])}), Dataset({"y": ("a", [1]), "x": ("a", [1])}), ] + actual = auto_combine(objs) expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])}) assert_identical(expected, actual) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] + actual = auto_combine(objs) + expected = Dataset({"x": [0], "y": [0, np.nan]}) + assert_identical(expected, actual) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] with raises_regex(ValueError, "too many .* dimensions"): auto_combine(objs) @@ -754,10 +760,6 @@ def test_auto_combine(self): with raises_regex(ValueError, "cannot infer dimension"): auto_combine(objs) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with raises_regex(ValueError, "'y' is not present in all datasets"): - auto_combine(objs) - def test_auto_combine_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. 
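Before the test_concat.py changes below, a minimal sketch of the behavior this first patch is aiming for (hedged: ds1/ds2 are illustrative stand-ins, not datasets from the test suite). Concatenating datasets in which a variable is missing from some members now fills the gap instead of raising:

import numpy as np
import xarray as xr

ds1 = xr.Dataset({"x": ("a", [0]), "y": ("a", [0.0])})
ds2 = xr.Dataset({"x": ("a", [1])})  # "y" is deliberately missing here

# Previously this raised ValueError: 'y' is not present in all datasets.
# With this patch the missing block is filled with NaN instead:
combined = xr.concat([ds1, ds2], dim="a")
print(combined["y"].values)  # expected: [ 0. nan]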
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0661ebb7a38..3bf0faf7253 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -35,17 +35,28 @@ def test_concat_compat(): }, coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, ) - + ds_concat = Dataset( + { + "has_x_y": ( + ("q", "y", "x"), + [[[np.nan, np.nan], [3, 4]], [[1, 2], [np.nan, np.nan]]], + ), + "has_x": (("q", "x"), [[1, 2], [1, 2]]), + "no_x_y": (("q", "z"), [[1, 2], [1, 2]]), + }, + coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "q": [0, np.nan]}, + ) result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") assert_equal(ds2.no_x_y, result.no_x_y.transpose()) for var in ["has_x", "no_x_y"]: assert "y" not in result[var] + result2 = concat([ds2, ds1], dim="q") + assert_equal(ds_concat, result2) + with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") - with raises_regex(ValueError, "'q' is not present in all datasets"): - concat([ds2, ds1], dim="q") class TestConcatDataset: From a96583b52ceac9a6f2d842608e07e0ff47701bb6 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 17 Nov 2019 12:44:16 -0800 Subject: [PATCH 02/13] Update whats-new.rst with change --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..de5390da140 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,6 +100,8 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. +- Make :py:func:`~xarray.concat` more robust when merging variables present in some datasets but + not others (:issue:`508`). By `Scott Chamberlin `_. Documentation ~~~~~~~~~~~~~ From af347e77fbc9482daf8a6e3d1a1a0e6c80d17c7c Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 09:27:59 -0800 Subject: [PATCH 03/13] Update doc/whats-new.rst Co-Authored-By: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index de5390da140..47ea9f0b6f5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,7 +100,7 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. -- Make :py:func:`~xarray.concat` more robust when merging variables present in some datasets but +- Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but not others (:issue:`508`). By `Scott Chamberlin `_. Documentation From 418c538e7ab9f3398c8decd7ca22f0e584cca6cc Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 09:57:27 -0800 Subject: [PATCH 04/13] Update test to check dtypes for fill values on concat --- xarray/core/concat.py | 12 +++++++++--- xarray/tests/test_concat.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index abee1ef1095..a4a2fab3e24 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,5 +1,4 @@ import pandas as pd -import numpy as np from . 
import dtypes, utils from .alignment import align @@ -379,9 +378,16 @@ def ensure_common_dims(vars): if k not in ds.variables: for ds in datasets: if k in ds.variables: - # found one to use as a fill value, fill with np.nan + # found one to use as a fill value, fill with fill_value + if fill_value is dtypes.NA: + dtype, fill_value = dtypes.maybe_promote( + ds.variables[k].dtype + ) + else: + dtype = ds.variables[k].dtype + filled = full_like( - ds.variables[k], fill_value=np.nan, dtype=np.double + ds.variables[k], fill_value=fill_value, dtype=dtype ) break variables.append(filled) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3bf0faf7253..3430af35d43 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -338,17 +338,29 @@ def test_concat_fill_value(self, fill_value): Dataset({"a": ("x", [2, 3]), "x": [1, 2]}), Dataset({"a": ("x", [1, 2]), "x": [0, 1]}), ] + if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array - fill_value = np.nan + fill_value_expected = np.nan + else: + fill_value_expected = fill_value + expected = Dataset( - {"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])}, + { + "a": ( + ("t", "x"), + [[fill_value_expected, 2, 3], [1, 2, fill_value_expected]], + ) + }, {"x": [0, 1, 2]}, ) actual = concat(datasets, dim="t", fill_value=fill_value) assert_identical(actual, expected) + # check that the dtype is as expected + assert expected.a.dtype == type(fill_value_expected) + class TestConcatDataArray: def test_concat(self): From f7124a347ba99c31e8b6eb5fad460662c2c77ca5 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 10:02:40 -0800 Subject: [PATCH 05/13] update docstring to explain new fill_value behavior --- xarray/core/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index a4a2fab3e24..b1ff5dd81d2 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -78,7 +78,8 @@ def concat( to assign each dataset along the concatenated dimension. If not supplied, objects are concatenated in the provided order. fill_value : scalar, optional - Value to use for newly missing values + Value to use for newly missing values as well as to fill values where the + variable is not present in all datasets. join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes (excluding dim) in objects From 47f7e4d12f9ea5544920b95013de96c20e4e170d Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Tue, 19 Nov 2019 17:01:18 -0800 Subject: [PATCH 06/13] Update doc/whats-new.rst Co-Authored-By: keewis --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 47ea9f0b6f5..232af2c45a2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -101,7 +101,7 @@ Bug fixes - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. - Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but - not others (:issue:`508`). By `Scott Chamberlin `_. + not others (:issue:`508`). By `Scott Chamberlin `_. 
Documentation ~~~~~~~~~~~~~ From df3693e356cefba5232d2bbe535b1eacc550b3e3 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Tue, 19 Nov 2019 17:17:55 -0800 Subject: [PATCH 07/13] Remove test_combine.py update from pr --- xarray/tests/test_combine.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 47a92538474..cd26e7fb60b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -742,16 +742,10 @@ def test_auto_combine(self): Dataset({"x": ("a", [0]), "y": ("a", [0])}), Dataset({"y": ("a", [1]), "x": ("a", [1])}), ] - actual = auto_combine(objs) expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])}) assert_identical(expected, actual) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - actual = auto_combine(objs) - expected = Dataset({"x": [0], "y": [0, np.nan]}) - assert_identical(expected, actual) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] with raises_regex(ValueError, "too many .* dimensions"): auto_combine(objs) @@ -760,6 +754,10 @@ def test_auto_combine(self): with raises_regex(ValueError, "cannot infer dimension"): auto_combine(objs) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] + with raises_regex(ValueError, "'y' is not present in all datasets"): + auto_combine(objs) + def test_auto_combine_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. From c21dcd4e6c6c9d210eaa8206cb4755ca98faff51 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 30 Dec 2019 14:59:49 -0800 Subject: [PATCH 08/13] Improved implementation of concat which respects type promotion, ordering and variable types when variables are missing --- xarray/core/concat.py | 122 +++++++--- xarray/tests/test_combine.py | 4 +- xarray/tests/test_concat.py | 426 +++++++++++++++++++++++++++++++++-- 3 files changed, 506 insertions(+), 46 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b1ff5dd81d2..b3fa694c04a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,4 +1,5 @@ import pandas as pd +from collections import OrderedDict from . import dtypes, utils from .alignment import align @@ -27,7 +28,7 @@ def concat( xarray objects to concatenate together. Each object is expected to consist of variables and coordinates with matching shapes except for along the concatenated dimension. - dim : str or DataArray or pandas.Index + dim : str, DataArray, Variable, or pandas.Index Name of the dimension to concatenate along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the dimension is @@ -131,6 +132,7 @@ def concat( "can only concatenate xarray Dataset and DataArray " "objects, got %s" % type(first_obj) ) + return f(objs, dim, data_vars, coords, compat, positions, fill_value, join) @@ -368,44 +370,102 @@ def ensure_common_dims(vars): var = var.set_dims(common_dims, common_shape) yield var - # stack up each variable to fill-out the dataset (in order) - # n.b. this loop preserves variable order, needed for groupby. 
- for k in datasets[0].variables: - if k in concat_over: + # Find union of all data variables (preserving order) + # assumes all datasets are relatively in the same order + # and missing variables are inserted in the correct position + # if datasets have variables in drastically different orders + # the resulting order will be dependent on the order they are in the list + # passed to concat + union_of_variables = OrderedDict() + union_of_coordinates = OrderedDict() + for ds in datasets: + var_list = list(ds.variables.keys()) + # this logic maintains the order of the variable list and runs in + # O(n^2) where n is number of variables in the uncommon worst case + # where there are no missing variables this will be O(n) + for i in range(0, len(var_list)): + if var_list[i] not in union_of_variables: + # need to determine the correct place + # first add the new item which will be at the end + union_of_variables[var_list[i]] = None + union_of_variables.move_to_end(var_list[i]) + # move any items after this in the variables list to the end + # this will only happen for missing variables + for j in range(i + 1, len(var_list)): + if var_list[j] in union_of_variables: + union_of_variables.move_to_end(var_list[j]) + + # check that all datasets have the same coordinate set + if len(union_of_coordinates) > 0: + coord_set_diff = ( + union_of_coordinates.keys() ^ ds.coords.keys() + ) & concat_over + if len(coord_set_diff) > 0: + raise ValueError( + "Variables %r are coordinates in some datasets but not others." + % coord_set_diff + ) + + union_of_coordinates = dict( + union_of_coordinates.items() | dict.fromkeys(ds.coords).items() + ) + + # we don't want to fill coordinate variables so remove them + for k in union_of_coordinates.keys(): + union_of_variables.pop(k, None) + + # Cache a filled tmp variable with correct dims for filling missing variables + # doing this here allows us to concat with variables missing from any dataset + # will only run until it finds one prototype for each variable in the concat list + # we will also only fill defaults for data_vars, not coordinates + + # optimization to allow us to break when filling a variable + def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): + for ds in datasets: + if union_of_variables[variable_key] is not None: + continue + + if variable_key not in ds.variables: + continue + + v_fill_value = fill_value + if fill_value is dtypes.NA: + dtype, v_fill_value = dtypes.maybe_promote(ds[variable_key].dtype) + else: + dtype = ds[variable_key].dtype + + union_of_variables[variable_key] = full_like( + ds[variable_key], fill_value=v_fill_value, dtype=dtype + ) + return + + for v in union_of_variables.keys(): + find_fill_variable_from_ds(v, union_of_variables, datasets) + + # create the concat list filling in missing variables + while len(union_of_variables) > 0 or len(union_of_coordinates) > 0: + k = None + # get the variables in order + if len(union_of_variables) > 0: + k = union_of_variables.popitem(last=False) + elif len(union_of_coordinates) > 0: + k = union_of_coordinates.popitem() + + if k[0] in concat_over: variables = [] for ds in datasets: - # if one of the variables doesn't exist find one which does - # and use it to create a fill value - if k not in ds.variables: - for ds in datasets: - if k in ds.variables: - # found one to use as a fill value, fill with fill_value - if fill_value is dtypes.NA: - dtype, fill_value = dtypes.maybe_promote( - ds.variables[k].dtype - ) - else: - dtype = ds.variables[k].dtype - - filled = full_like(
ds.variables[k], fill_value=fill_value, dtype=dtype - ) - break - variables.append(filled) + if k[0] in ds.variables: + variables.append(ds.variables[k[0]]) else: - variables.append(ds.variables[k]) + # var is missing, fill with cached value + variables.append(k[1]) + vars = ensure_common_dims(variables) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) - result_vars[k] = combined + result_vars[k[0]] = combined result = Dataset(result_vars, attrs=result_attrs) - absent_coord_names = coord_names - set(result.variables) - if absent_coord_names: - raise ValueError( - "Variables %r are coordinates in some datasets but not others." - % absent_coord_names - ) result = result.set_coords(coord_names) result.encoding = result_encoding diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cd26e7fb60b..009a50eb97b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -755,7 +755,9 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with raises_regex(ValueError, "'y' is not present in all datasets"): + with raises_regex( + ValueError, ".* are coordinates in some datasets but not others" + ): auto_combine(objs) def test_auto_combine_previously_failed(self): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3430af35d43..81511c2579f 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +import random from xarray import DataArray, Dataset, Variable, concat from xarray.core import dtypes, merge @@ -18,6 +19,71 @@ from .test_dataset import create_test_data +# helper method to create multiple test datasets to concat +def create_concat_datasets(num_datasets=2, seed=None): + random.seed(seed) + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + +# helper method to create multiple test datasets to concat with specific types +def create_typed_datasets(num_datasets=2, seed=None): + random.seed(seed) + var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "string": ( + ["x", "y", "day"], + np.random.choice(var_strings, (1, 4, 2)), + ), + "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "datetime64": ( + ["x", "y", "day"], + np.arange( + np.datetime64("2017-01-01"), np.datetime64("2017-01-09") + ).reshape(1, 4, 2), + ), + "timedelta64": ( + ["x", "y", "day"], + np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]), + ), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + + def
test_concat_compat(): ds1 = Dataset( { @@ -35,29 +101,358 @@ def test_concat_compat(): }, coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, ) - ds_concat = Dataset( - { - "has_x_y": ( - ("q", "y", "x"), - [[[np.nan, np.nan], [3, 4]], [[1, 2], [np.nan, np.nan]]], - ), - "has_x": (("q", "x"), [[1, 2], [1, 2]]), - "no_x_y": (("q", "z"), [[1, 2], [1, 2]]), - }, - coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "q": [0, np.nan]}, - ) + result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") assert_equal(ds2.no_x_y, result.no_x_y.transpose()) for var in ["has_x", "no_x_y"]: assert "y" not in result[var] - result2 = concat([ds2, ds1], dim="q") - assert_equal(ds_concat, result2) - with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") + with raises_regex(ValueError, "coordinates in some datasets but not others"): + concat([ds2, ds1], dim="q") + + +def test_concat_missing_var(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["humidity", "precipitation", "cloud cover"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_concat_all_empty(): + ds1 = Dataset() + ds2 = Dataset() + result = concat([ds1, ds2], dim="new_dim") + + assert_equal(result, Dataset()) + + +def test_concat_second_empty(): + ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + result = concat([ds1, ds2], dim="y") + + assert_equal(result, ds_result) + + +def test_multiple_missing_variables(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["pressure", "cloud cover"] + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + 
assert_equal(result, ds_result) + + +def test_multiple_datasets_with_missing_variables(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundary cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=2, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], result_vars["temperature"]), + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_multiple_missing_variables(): + vars_to_drop_in_first = ["temperature", "pressure"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_type_of_missing_fill(): + datasets = create_typed_datasets(2, 123) + + vars = ["float", "float2", "string", "int", "datetime64",
"timedelta64"] + + # set up the test data + datasets[1] = datasets[1].drop_vars(vars[1:]) + + float_result = np.concatenate( + (datasets[0].float.values, datasets[1].float.values), axis=2 + ) + float2_result = np.concatenate( + (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + # to correctly create the expected dataset we need to ensure we promote the string array to + # object type before filling as it will be promoted to that in the concat case. + # this matches the behavior of pandas + string_values = datasets[0].string.values + string_values = string_values.astype(object) + string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) + datetime_result = np.concatenate( + (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), + axis=2, + ) + timedelta_result = np.concatenate( + (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), + axis=2, + ) + # string_result = string_result.astype(object) + int_result = np.concatenate( + (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result), + "float2": (["x", "y", "day"], float2_result), + "string": (["x", "y", "day"], string_result), + "int": (["x", "y", "day"], int_result), + "datetime64": (["x", "y", "day"], datetime_result), + "timedelta64": (["x", "y", "day"], timedelta_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day", fill_value=dtypes.NA) + + assert_equal(result, ds_result) + + # test in the reverse order + float_result_rev = np.concatenate( + (datasets[1].float.values, datasets[0].float.values), axis=2 + ) + float2_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 + ) + string_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), string_values), axis=2 + ) + datetime_result_rev = np.concatenate( + (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), + axis=2, + ) + timedelta_result_rev = np.concatenate( + (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), + axis=2, + ) + int_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 + ) + ds_result_rev = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result_rev), + "float2": (["x", "y", "day"], float2_result_rev), + "string": (["x", "y", "day"], string_result_rev), + "int": (["x", "y", "day"], int_result_rev), + "datetime64": (["x", "y", "day"], datetime_result_rev), + "timedelta64": (["x", "y", "day"], timedelta_result_rev), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day3", "day4", "day1", "day2"], + }, + ) + result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + + assert_equal(result_rev, ds_result_rev) + + +def test_order_when_filling_missing(): + vars_to_drop_in_first = [] + # drop middle + vars_to_drop_in_second = ["humidity"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, 
datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + result_keys = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + result_index = 0 + for k in result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + # test order when concat in reversed order + rev_result = concat(datasets[::-1], dim="day") + result_index = 0 + for k in rev_result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + class TestConcatDataset: @pytest.fixture @@ -332,6 +727,7 @@ def test_concat_multiindex(self): assert expected.equals(actual) assert isinstance(actual.x.to_index(), pd.MultiIndex) + # TODO add parameter for missing var @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value): datasets = [ @@ -394,6 +790,8 @@ def test_concat(self): expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual) + # TODO: is it really correct to expect the new dim to be concat_dim in this case + # I propose it's likely better to throw an exception actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual) From 4e01bd98437e52c68c73b52beb90a216e957aa98 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 30 Dec 2019 16:04:43 -0800 Subject: [PATCH 09/13] refactor variable promotion logic to dtypes.py --- xarray/core/concat.py | 7 +++---- xarray/core/dtypes.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b3fa694c04a..089253dc59e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -429,10 +429,9 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): continue v_fill_value = fill_value - if fill_value is dtypes.NA: - dtype, v_fill_value = dtypes.maybe_promote(ds[variable_key].dtype) - else: - dtype = ds[variable_key].dtype + dtype, v_fill_value = dtypes.get_fill_value_for_variable( + ds[variable_key], fill_value + ) union_of_variables[variable_key] = full_like( ds[variable_key], fill_value=v_fill_value, dtype=dtype ) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 4db2990accc..f52ff73a6b6 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -4,6 +4,7 @@ from . import utils + # Use as a sentinel value to indicate a dtype appropriate NA value.
NA = utils.ReprObject("") @@ -96,6 +97,37 @@ def get_fill_value(dtype): return fill_value +def get_fill_value_for_variable(variable, fill_value=NA): + """Return an appropriate fill value for this variable + + Parameters + ---------- + variable : Dataset or DataArray + fill_value : a suggested fill value to evaluate and promote if necessary + + Returns + ------- + dtype : Promoted dtype for fill value. + new_fill_value : Missing value corresponding to this dtype. + """ + from .dataset import Dataset + from .dataarray import DataArray + + if not (isinstance(variable, DataArray) or isinstance(variable, Dataset)): + raise TypeError( + "can only get fill value for xarray Dataset and DataArray " + "objects, got %s" % type(variable) + ) + + new_fill_value = fill_value + if fill_value is NA: + dtype, new_fill_value = maybe_promote(variable.dtype) + else: + dtype = variable.dtype + + return dtype, new_fill_value + + def get_pos_infinity(dtype): """Return an appropriate positive infinity for this dtype. From 515b9c19a0b0435d81cd8bc0594672c14dc36680 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Thu, 23 Jan 2020 19:51:41 -0800 Subject: [PATCH 10/13] refactor variable order code to its own internal method --- xarray/core/concat.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 089253dc59e..f802e3da2c1 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -380,20 +380,8 @@ def ensure_common_dims(vars): union_of_coordinates = OrderedDict() for ds in datasets: var_list = list(ds.variables.keys()) - # this logic maintains the order of the variable list and runs in - # O(n^2) where n is number of variables in the uncommon worst case - # where there are no missing variables this will be O(n) - for i in range(0, len(var_list)): - if var_list[i] not in union_of_variables: - # need to determine the correct place - # first add the new item which will be at the end - union_of_variables[var_list[i]] = None - union_of_variables.move_to_end(var_list[i]) - # move any items after this in the variables list to the end - # this will only happen for missing variables - for j in range(i + 1, len(var_list)): - if var_list[j] in union_of_variables: - union_of_variables.move_to_end(var_list[j]) + + _find_ordering_inplace(var_list, union_of_variables) # check that all datasets have the same coordinate set if len(union_of_coordinates) > 0: @@ -406,8 +394,8 @@ def ensure_common_dims(vars): % coord_set_diff ) - union_of_coordinates = dict( - union_of_coordinates.items() | dict.fromkeys(ds.coords).items() + union_of_coordinates = OrderedDict( + union_of_coordinates.items() | OrderedDict.fromkeys(ds.coords).items() ) # we don't want to fill coordinate variables so remove them @@ -477,6 +465,26 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): return result +def _find_ordering_inplace(l, union): + # this logic maintains the order of the variable list and runs in + # O(n^2) where n is number of variables in the uncommon worst case + # where there are no missing variables this will be O(n) + # could potentially be refactored to a more generic function to determine + # a consistent ordering of variables if proper consideration were + # given both to the runtime as well as to the user scenarios + for i in range(0, len(l)): + if l[i] not in union: + # need to determine the correct place + # first add the new item which will be at the end + union[l[i]] = None
union.move_to_end(l[i]) + # move any items after this in the variables list to the end + # this will only happen for missing variables + for j in range(i + 1, len(l)): + if l[j] in union: + union.move_to_end(l[j]) + def _dataarray_concat( arrays, dim, From cf5b8bdfb0fdaf626ecf7b83590f15aa9aef1d6b Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Fri, 24 Jan 2020 14:02:03 -0800 Subject: [PATCH 11/13] Add test for multiple consecutive missing variables --- xarray/tests/test_concat.py | 94 +++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 81511c2579f..752f51ba773 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -140,6 +140,78 @@ def test_concat_missing_var(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) + + +def test_concat_missing_multiple_consecutive_var(): + datasets = create_concat_datasets(3, 123) + vars_to_drop = ["pressure", "humidity"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + ( + datasets[0].temperature.values, + datasets[1].temperature.values, + datasets[2].temperature.values, + ), + axis=2, + ) + pressure_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].pressure.values, + ), + axis=2, + ) + humidity_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].humidity.values, + ), + axis=2, + ) + precipitation_result = np.concatenate( + ( + datasets[0].precipitation.values, + datasets[1].precipitation.values, + datasets[2].precipitation.values, + ), + axis=2, + ) + cloudcover_result = np.concatenate( + ( + datasets[0]["cloud cover"].values, + datasets[1]["cloud cover"].values, + datasets[2]["cloud cover"].values, + ), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4", "day5", "day6"], + }, + ) + result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same assert_equal(result, ds_result) @@ -197,6 +269,10 @@ def test_multiple_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) @@ -241,8 +317,10 @@ def test_multiple_datasets_with_missing_variables(): ds_result = Dataset( data_vars={ - "temperature": (["x", "y", "day"], result_vars["temperature"]), + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first "pressure": (["x", "y", "day"], result_vars["pressure"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), "humidity": (["x", "y", "day"],
result_vars["humidity"]), "precipitation": (["x", "y", "day"], result_vars["precipitation"]), "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), @@ -255,6 +333,10 @@ def test_multiple_datasets_with_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) @@ -283,11 +365,13 @@ def test_multiple_datasets_with_multiple_missing_variables(): ) ds_result = Dataset( data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), + # these to are at the end of the expected as they are missing from the first + # dataset in the concat list + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -297,6 +381,10 @@ def test_multiple_datasets_with_multiple_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) From 3bf393170ae96230f63621cf720102468edaf097 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sat, 15 Feb 2020 10:30:29 -0800 Subject: [PATCH 12/13] Simplify variable ordering in concat for missing variable case --- xarray/core/concat.py | 44 +++++------- xarray/tests/test_concat.py | 139 ++++++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 48 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f802e3da2c1..961e3b8e727 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -265,21 +265,21 @@ def _parse_datasets(datasets): dims = set() all_coord_names = set() - data_vars = set() # list of data_vars + data_vars = {} # list of data_vars, using dict internally to maintain order dim_coords = {} # maps dim name to variable dims_sizes = {} # shared dimension sizes to expand variables for ds in datasets: dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) - data_vars.update(ds.data_vars) + data_vars.update(dict.fromkeys(ds.data_vars)) for dim in set(ds.dims) - dims: if dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, dims_sizes, all_coord_names, data_vars + return dim_coords, dims_sizes, all_coord_names, list(data_vars.keys()) def _dataset_concat( @@ -308,7 +308,7 @@ def _dataset_concat( dim_names = set(dim_coords) unlabeled_dims = dim_names - coord_names - both_data_and_coords = coord_names & data_names + both_data_and_coords = coord_names & set(data_names) if both_data_and_coords: raise ValueError( "%r is a coordinate in some datasets but not others." 
% both_data_and_coords @@ -327,7 +327,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - dim_names + variables_to_merge = (coord_names | set(data_names)) - concat_over - dim_names result_vars = {} if variables_to_merge: @@ -376,27 +376,11 @@ def ensure_common_dims(vars): # if datasets have variables in drastically different orders # the resulting order will be dependent on the order they are in the list # passed to concat - union_of_variables = OrderedDict() - union_of_coordinates = OrderedDict() - for ds in datasets: - var_list = list(ds.variables.keys()) - - _find_ordering_inplace(var_list, union_of_variables) - - # check that all datasets have the same coordinate set - if len(union_of_coordinates) > 0: - coord_set_diff = ( - union_of_coordinates.keys() ^ ds.coords.keys() - ) & concat_over - if len(coord_set_diff) > 0: - raise ValueError( - "Variables %r are coordinates in some datasets but not others." - % coord_set_diff - ) + data_var_order = list(datasets[0].data_vars) + data_var_order += [e for e in data_names if e not in data_var_order] - union_of_coordinates = OrderedDict( - union_of_coordinates.items() | OrderedDict.fromkeys(ds.coords).items() - ) + union_of_variables = OrderedDict.fromkeys(data_var_order) + union_of_coordinates = OrderedDict.fromkeys(coord_names) # we don't want to fill coordinate variables so remove them for k in union_of_coordinates.keys(): @@ -422,7 +406,7 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): ) union_of_variables[variable_key] = full_like( - ds[variable_key], fill_value=v_fill_value, dtype=dtype + ds.variables[variable_key], fill_value=v_fill_value, dtype=dtype ) return @@ -430,12 +414,14 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): find_fill_variable_from_ds(v, union_of_variables, datasets) # create the concat list filling in missing variables + filling_coordinates = False while len(union_of_variables) > 0 or len(union_of_coordinates) > 0: k = None # get the variables in order if len(union_of_variables) > 0: k = union_of_variables.popitem(last=False) elif len(union_of_coordinates) > 0: + filling_coordinates = True k = union_of_coordinates.popitem() if k[0] in concat_over: @@ -444,6 +430,12 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): if k[0] in ds.variables: variables.append(ds.variables[k[0]]) else: + if filling_coordinates: + # in this case the coordinate is missing from a dataset + raise ValueError( + "Variables %r are coordinates in some datasets but not others." 
+ % k[0] + ) # var is missing, fill with cached value variables.append(k[1]) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 752f51ba773..45e2250b3f6 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -20,28 +20,43 @@ # helper method to create multiple test datasets to concat -def create_concat_datasets(num_datasets=2, seed=None): +def create_concat_datasets(num_datasets=2, seed=None, include_day=True): random.seed(seed) result = [] lat = np.random.randn(1, 4) lon = np.random.randn(1, 4) for i in range(num_datasets): - result.append( - Dataset( - data_vars={ - "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), - }, - coords={ - "lat": (["x", "y"], lat), - "lon": (["x", "y"], lon), - "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], - }, + if include_day: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) ) - ) + else: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y"], np.random.randn(1, 4)), + "pressure": (["x", "y"], np.random.randn(1, 4)), + "humidity": (["x", "y"], np.random.randn(1, 4)), + "precipitation": (["x", "y"], np.random.randn(1, 4)), + "cloud cover": (["x", "y"], np.random.randn(1, 4)), + }, + coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, + ) + ) + return result @@ -197,10 +212,10 @@ ds_result = Dataset( data_vars={ "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -276,6 +291,77 @@ def test_multiple_missing_variables(): assert_equal(result, ds_result) +def test_concat_multiple_datasets_missing_vars_and_new_dim(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundary cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] =
np.concatenate( ( result_vars[vars_to_drop[i]], datasets[d][vars_to_drop[i]].values, ), axis=1, ) else: if result_vars[vars_to_drop[i]] is None: result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1, ) # TODO: this test still has two unexpected errors: # 1: concat throws a MergeError expecting the temperature values to be the same, this doesn't seem to be correct in this case # as we are concatenating on new dims # 2: if the values are the same for a variable (working around #1) then it will likely not correctly add the new dim to the first variable in # the resulting set # ds_result = Dataset( # data_vars={ # # pressure will be first in this since the first dataset is missing this var # # and there isn't a good way to determine that this should be first # # this also means temperature will be last as the first data vars will # # determine the order for all that exist in that dataset # "pressure": (["x", "y", "day"], result_vars["pressure"]), # "humidity": (["x", "y", "day"], result_vars["humidity"]), # "precipitation": (["x", "y", "day"], result_vars["precipitation"]), # "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), # "temperature": (["x", "y", "day"], result_vars["temperature"]), # }, # coords={ # "lat": (["x", "y"], datasets[0].lat.values), # "lon": (["x", "y"], datasets[0].lon.values), # # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], # }, # ) # result = concat(datasets, dim="day") # r1 = list(result.data_vars.keys()) # r2 = list(ds_result.data_vars.keys()) # assert r1 == r2 # check the variables orders are the same # assert_equal(result, ds_result) def test_multiple_datasets_with_missing_variables(): vars_to_drop = [ "temperature", "pressure", "humidity", "precipitation", "cloud cover", ] @@ -319,11 +405,13 @@ data_vars={ # pressure will be first in this since the first dataset is missing this var # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), "humidity": (["x", "y", "day"], result_vars["humidity"]), "precipitation": (["x", "y", "day"], result_vars["precipitation"]), "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -368,7 +456,7 @@ def test_multiple_datasets_with_multiple_missing_variables(): "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), - # these to are at the end of the expected as they are missing from the first + # these two are at the end of the expected as they are missing from the first # dataset in the concat list "temperature": (["x", "y", "day"], temperature_result), "pressure": (["x", "y", "day"], pressure_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -508,9 +596,9 @@ def test_order_when_filling_missing(): data_vars={ "temperature": (["x", "y", "day"], temperature_result), "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover":
(["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -534,11 +622,18 @@ def test_order_when_filling_missing(): assert k == result_keys[result_index] result_index += 1 + result_keys_rev = [ + "temperature", + "pressure", + "precipitation", + "cloud cover", + "humidity", + ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") result_index = 0 for k in rev_result.data_vars.keys(): - assert k == result_keys[result_index] + assert k == result_keys_rev[result_index] result_index += 1 From 03f9b3b85aee039f47dd693322492ab09f57fb73 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sat, 15 Feb 2020 10:44:25 -0800 Subject: [PATCH 13/13] remove unused helper method --- xarray/core/concat.py | 20 -------------------- xarray/tests/test_concat.py | 3 ++- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 961e3b8e727..82bfc0ea4d7 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -457,26 +457,6 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): return result -def _find_ordering_inplace(l, union): - # this logic maintains the order of the variable list and runs in - # O(n^2) where n is number of variables in the uncommon worst case - # where there are no missing variables this will be O(n) - # could potentially be refactored to a more generic function to determine - # a consistent ordering of variables if proper consideration were - # given both to the runtime as well as to the user scenarios - for i in range(0, len(l)): - if l[i] not in union: - # need to determine the correct place - # first add the new item which will be at the end - union[l[i]] = None - union.move_to_end(l[i]) - # move any items after this in the variables list to the end - # this will only happen for missing variables - for j in range(i + 1, len(l)): - if l[j] in union: - union.move_to_end(l[j]) - - def _dataarray_concat( arrays, dim, diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 45e2250b3f6..f52c729a13e 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -291,6 +291,7 @@ def test_multiple_missing_variables(): assert_equal(result, ds_result) +@pytest.mark.xfail(strict=True) def test_concat_multiple_datasets_missing_vars_and_new_dim(): vars_to_drop = [ "temperature", @@ -329,6 +330,7 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1, ) # TODO: this test still has two unexpected errors: + # 1: concat throws a mergeerror expecting the temperature values to be the same, this doesn't seem to be correct in this case # as we are concating on new dims # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable @@ -354,7 +356,6 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): # ) # result = concat(datasets, dim="day") - # r1 = list(result.data_vars.keys()) # r2 = list(ds_result.data_vars.keys()) # assert r1 == r2 # check the variables orders are the same