From baeebeda6b911df6a420b33c7d328d42cb425923 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 17 Nov 2019 12:35:11 -0800 Subject: [PATCH 01/13] Concat adds default variables when missing from a dataset --- xarray/core/concat.py | 22 ++++++++++++++++++---- xarray/tests/test_combine.py | 10 ++++++---- xarray/tests/test_concat.py | 17 ++++++++++++++--- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5b4fc078236..abee1ef1095 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,7 +1,9 @@ import pandas as pd +import numpy as np from . import dtypes, utils from .alignment import align +from .common import full_like from .duck_array_ops import lazy_array_equiv from .merge import _VALID_COMPAT, unique_variable from .variable import IndexVariable, Variable, as_variable @@ -370,10 +372,22 @@ def ensure_common_dims(vars): # n.b. this loop preserves variable order, needed for groupby. for k in datasets[0].variables: if k in concat_over: - try: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - except KeyError: - raise ValueError("%r is not present in all datasets." % k) + variables = [] + for ds in datasets: + # if one of the variables doesn't exist find one which does + # and use it to create a fill value + if k not in ds.variables: + for ds in datasets: + if k in ds.variables: + # found one to use as a fill value, fill with np.nan + filled = full_like( + ds.variables[k], fill_value=np.nan, dtype=np.double + ) + break + variables.append(filled) + else: + variables.append(ds.variables[k]) + vars = ensure_common_dims(variables) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) result_vars[k] = combined diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cd26e7fb60b..47a92538474 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -742,10 +742,16 @@ def test_auto_combine(self): Dataset({"x": ("a", [0]), "y": ("a", [0])}), Dataset({"y": ("a", [1]), "x": ("a", [1])}), ] + actual = auto_combine(objs) expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])}) assert_identical(expected, actual) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] + actual = auto_combine(objs) + expected = Dataset({"x": [0], "y": [0, np.nan]}) + assert_identical(expected, actual) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] with raises_regex(ValueError, "too many .* dimensions"): auto_combine(objs) @@ -754,10 +760,6 @@ def test_auto_combine(self): with raises_regex(ValueError, "cannot infer dimension"): auto_combine(objs) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with raises_regex(ValueError, "'y' is not present in all datasets"): - auto_combine(objs) - def test_auto_combine_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. 
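Before the test_concat.py changes below, a minimal sketch of the behavior this first patch is aiming for (hedged: ds1/ds2 are illustrative stand-ins, not datasets from the test suite). Concatenating datasets in which a variable is missing from some members now fills the gap instead of raising:

import numpy as np
import xarray as xr

ds1 = xr.Dataset({"x": ("a", [0]), "y": ("a", [0.0])})
ds2 = xr.Dataset({"x": ("a", [1])})  # "y" is deliberately missing here

# Previously this raised ValueError: 'y' is not present in all datasets.
# With this patch the missing block is filled with NaN instead:
combined = xr.concat([ds1, ds2], dim="a")
print(combined["y"].values)  # expected: [ 0. nan]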
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0661ebb7a38..3bf0faf7253 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -35,17 +35,28 @@ def test_concat_compat(): }, coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, ) - + ds_concat = Dataset( + { + "has_x_y": ( + ("q", "y", "x"), + [[[np.nan, np.nan], [3, 4]], [[1, 2], [np.nan, np.nan]]], + ), + "has_x": (("q", "x"), [[1, 2], [1, 2]]), + "no_x_y": (("q", "z"), [[1, 2], [1, 2]]), + }, + coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "q": [0, np.nan]}, + ) result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") assert_equal(ds2.no_x_y, result.no_x_y.transpose()) for var in ["has_x", "no_x_y"]: assert "y" not in result[var] + result2 = concat([ds2, ds1], dim="q") + assert_equal(ds_concat, result2) + with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") - with raises_regex(ValueError, "'q' is not present in all datasets"): - concat([ds2, ds1], dim="q") class TestConcatDataset: From a96583b52ceac9a6f2d842608e07e0ff47701bb6 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sun, 17 Nov 2019 12:44:16 -0800 Subject: [PATCH 02/13] Update whats-new.rst with change --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..de5390da140 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,6 +100,8 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. +- Make :py:func:`~xarray.concat` more robust when merging variables present in some datasets but + not others (:issue:`508`). By `Scott Chamberlin `_. Documentation ~~~~~~~~~~~~~ From af347e77fbc9482daf8a6e3d1a1a0e6c80d17c7c Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 09:27:59 -0800 Subject: [PATCH 03/13] Update doc/whats-new.rst Co-Authored-By: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index de5390da140..47ea9f0b6f5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,7 +100,7 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. -- Make :py:func:`~xarray.concat` more robust when merging variables present in some datasets but +- Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but not others (:issue:`508`). By `Scott Chamberlin `_. Documentation From 418c538e7ab9f3398c8decd7ca22f0e584cca6cc Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 09:57:27 -0800 Subject: [PATCH 04/13] Update test to check dtypes for fill values on concat --- xarray/core/concat.py | 12 +++++++++--- xarray/tests/test_concat.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index abee1ef1095..a4a2fab3e24 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,5 +1,4 @@ import pandas as pd -import numpy as np from . 
import dtypes, utils from .alignment import align @@ -379,9 +378,16 @@ def ensure_common_dims(vars): if k not in ds.variables: for ds in datasets: if k in ds.variables: - # found one to use as a fill value, fill with np.nan + # found one to use as a fill value, fill with fill_value + if fill_value is dtypes.NA: + dtype, fill_value = dtypes.maybe_promote( + ds.variables[k].dtype + ) + else: + dtype = ds.variables[k].dtype + filled = full_like( - ds.variables[k], fill_value=np.nan, dtype=np.double + ds.variables[k], fill_value=fill_value, dtype=dtype ) break variables.append(filled) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3bf0faf7253..3430af35d43 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -338,17 +338,29 @@ def test_concat_fill_value(self, fill_value): Dataset({"a": ("x", [2, 3]), "x": [1, 2]}), Dataset({"a": ("x", [1, 2]), "x": [0, 1]}), ] + if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array - fill_value = np.nan + fill_value_expected = np.nan + else: + fill_value_expected = fill_value + expected = Dataset( - {"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])}, + { + "a": ( + ("t", "x"), + [[fill_value_expected, 2, 3], [1, 2, fill_value_expected]], + ) + }, {"x": [0, 1, 2]}, ) actual = concat(datasets, dim="t", fill_value=fill_value) assert_identical(actual, expected) + # check that the dtype is as expected + assert expected.a.dtype == type(fill_value_expected) + class TestConcatDataArray: def test_concat(self): From f7124a347ba99c31e8b6eb5fad460662c2c77ca5 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 18 Nov 2019 10:02:40 -0800 Subject: [PATCH 05/13] update docstring to explain new fill_value behavior --- xarray/core/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index a4a2fab3e24..b1ff5dd81d2 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -78,7 +78,8 @@ def concat( to assign each dataset along the concatenated dimension. If not supplied, objects are concatenated in the provided order. fill_value : scalar, optional - Value to use for newly missing values + Value to use for newly missing values as well as to fill values where the + variable is not present in all datasets. join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes (excluding dim) in objects From 47f7e4d12f9ea5544920b95013de96c20e4e170d Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Tue, 19 Nov 2019 17:01:18 -0800 Subject: [PATCH 06/13] Update doc/whats-new.rst Co-Authored-By: keewis --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 47ea9f0b6f5..232af2c45a2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -101,7 +101,7 @@ Bug fixes - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. - Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but - not others (:issue:`508`). By `Scott Chamberlin `_. + not others (:issue:`508`). By `Scott Chamberlin `_. 
Documentation ~~~~~~~~~~~~~ From df3693e356cefba5232d2bbe535b1eacc550b3e3 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Tue, 19 Nov 2019 17:17:55 -0800 Subject: [PATCH 07/13] Remove test_combine.py update from pr --- xarray/tests/test_combine.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 47a92538474..cd26e7fb60b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -742,16 +742,10 @@ def test_auto_combine(self): Dataset({"x": ("a", [0]), "y": ("a", [0])}), Dataset({"y": ("a", [1]), "x": ("a", [1])}), ] - actual = auto_combine(objs) expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])}) assert_identical(expected, actual) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - actual = auto_combine(objs) - expected = Dataset({"x": [0], "y": [0, np.nan]}) - assert_identical(expected, actual) - objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] with raises_regex(ValueError, "too many .* dimensions"): auto_combine(objs) @@ -760,6 +754,10 @@ def test_auto_combine(self): with raises_regex(ValueError, "cannot infer dimension"): auto_combine(objs) + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] + with raises_regex(ValueError, "'y' is not present in all datasets"): + auto_combine(objs) + def test_auto_combine_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. From c21dcd4e6c6c9d210eaa8206cb4755ca98faff51 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 30 Dec 2019 14:59:49 -0800 Subject: [PATCH 08/13] Improved implementation of concat which respects type promotion, ordering and variable types when variables are missing --- xarray/core/concat.py | 122 +++++++--- xarray/tests/test_combine.py | 4 +- xarray/tests/test_concat.py | 426 +++++++++++++++++++++++++++++++++-- 3 files changed, 506 insertions(+), 46 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b1ff5dd81d2..b3fa694c04a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,4 +1,5 @@ import pandas as pd +from collections import OrderedDict from . import dtypes, utils from .alignment import align @@ -27,7 +28,7 @@ def concat( xarray objects to concatenate together. Each object is expected to consist of variables and coordinates with matching shapes except for along the concatenated dimension. - dim : str or DataArray or pandas.Index + dim : str, DataArray, Variable, or pandas.Index Name of the dimension to concatenate along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the dimension is @@ -131,6 +132,7 @@ def concat( "can only concatenate xarray Dataset and DataArray " "objects, got %s" % type(first_obj) ) + return f(objs, dim, data_vars, coords, compat, positions, fill_value, join) @@ -368,44 +370,102 @@ def ensure_common_dims(vars): var = var.set_dims(common_dims, common_shape) yield var - # stack up each variable to fill-out the dataset (in order) - # n.b. this loop preserves variable order, needed for groupby. 
- for k in datasets[0].variables: - if k in concat_over: + # Find union of all data variables (preserving order) + # assumes all datasets are relatively in the same order + # and missing variables are inserted in the correct position + # if datasets have variables in drastically different orders + # the resulting order will be dependent on the order they are in the list + # passed to concat + union_of_variables = OrderedDict() + union_of_coordinates = OrderedDict() + for ds in datasets: + var_list = list(ds.variables.keys()) + # this logic maintains the order of the variable list and runs in + # O(n^2) where n is number of variables in the uncommon worst case + # where there are no missing variables this will be O(n) + for i in range(0, len(var_list)): + if var_list[i] not in union_of_variables: + # need to determine the correct place + # first add the new item which will be at the end + union_of_variables[var_list[i]] = None + union_of_variables.move_to_end(var_list[i]) + # move any items after this in the variables list to the end + # this will only happen for missing variables + for j in range(i + 1, len(var_list)): + if var_list[j] in union_of_variables: + union_of_variables.move_to_end(var_list[j]) + + # check that all datasets have the same coordinate set + if len(union_of_coordinates) > 0: + coord_set_diff = ( + union_of_coordinates.keys() ^ ds.coords.keys() + ) & concat_over + if len(coord_set_diff) > 0: + raise ValueError( + "Variables %r are coordinates in some datasets but not others." + % coord_set_diff + ) + + union_of_coordinates = dict( + union_of_coordinates.items() | dict.fromkeys(ds.coords).items() + ) + + # we don't want to fill coordinate variables so remove them + for k in union_of_coordinates.keys(): + union_of_variables.pop(k, None) + + # Cache a filled tmp variable with correct dims for filling missing variables + # doing this here allows us to concat with variables missing from any dataset + # will only run until it finds one prototype for each variable in the concat list + # we will also only fill defaults for data_vars, not coordinates + + # optimization to allow us to break when filling a variable + def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): + for ds in datasets: + if union_of_variables[variable_key] is not None: + continue + + if variable_key not in ds.variables: + continue + + v_fill_value = fill_value + if fill_value is dtypes.NA: + dtype, v_fill_value = dtypes.maybe_promote(ds[variable_key].dtype) + else: + dtype = ds[variable_key].dtype + + union_of_variables[variable_key] = full_like( + ds[variable_key], fill_value=v_fill_value, dtype=dtype + ) + return + + for v in union_of_variables.keys(): + find_fill_variable_from_ds(v, union_of_variables, datasets) + + # create the concat list filling in missing variables + while len(union_of_variables) > 0 or len(union_of_coordinates) > 0: + k = None + # get the variables in order + if len(union_of_variables) > 0: + k = union_of_variables.popitem(last=False) + elif len(union_of_coordinates) > 0: + k = union_of_coordinates.popitem() + + if k[0] in concat_over: variables = [] for ds in datasets: - # if one of the variables doesn't exist find one which does - # and use it to create a fill value - if k not in ds.variables: - for ds in datasets: - if k in ds.variables: - # found one to use as a fill value, fill with fill_value - if fill_value is dtypes.NA: - dtype, fill_value = dtypes.maybe_promote( - ds.variables[k].dtype - ) - else: - dtype = ds.variables[k].dtype - - filled = full_like(
ds.variables[k], fill_value=fill_value, dtype=dtype - ) - break - variables.append(filled) + if k[0] in ds.variables: + variables.append(ds.variables[k[0]]) else: - variables.append(ds.variables[k]) + # var is missing, fill with cached value + variables.append(k[1]) + vars = ensure_common_dims(variables) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) - result_vars[k] = combined + result_vars[k[0]] = combined result = Dataset(result_vars, attrs=result_attrs) - absent_coord_names = coord_names - set(result.variables) - if absent_coord_names: - raise ValueError( - "Variables %r are coordinates in some datasets but not others." - % absent_coord_names - ) result = result.set_coords(coord_names) result.encoding = result_encoding diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index cd26e7fb60b..009a50eb97b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -755,7 +755,9 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with raises_regex(ValueError, "'y' is not present in all datasets"): + with raises_regex( + ValueError, ".* are coordinates in some datasets but not others" + ): auto_combine(objs) def test_auto_combine_previously_failed(self): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 3430af35d43..81511c2579f 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +import random from xarray import DataArray, Dataset, Variable, concat from xarray.core import dtypes, merge @@ -18,6 +19,71 @@ from .test_dataset import create_test_data +# helper method to create multiple test datasets to concat +def create_concat_datasets(num_datasets=2, seed=None): + random.seed(seed) + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + +# helper method to create multiple test datasets to concat with specific types +def create_typed_datasets(num_datasets=2, seed=None): + random.seed(seed) + var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"] + result = [] + lat = np.random.randn(1, 4) + lon = np.random.randn(1, 4) + for i in range(num_datasets): + result.append( + Dataset( + data_vars={ + "float": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "float2": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "string": ( + ["x", "y", "day"], + np.random.choice(var_strings, (1, 4, 2)), + ), + "int": (["x", "y", "day"], np.random.randint(0, 10, (1, 4, 2))), + "datetime64": ( + ["x", "y", "day"], + np.arange( + np.datetime64("2017-01-01"), np.datetime64("2017-01-09") + ).reshape(1, 4, 2), + ), + "timedelta64": ( + ["x", "y", "day"], + np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]), + ), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) + ) + return result + + + def
test_concat_compat(): ds1 = Dataset( { @@ -35,29 +101,358 @@ def test_concat_compat(): }, coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, ) - ds_concat = Dataset( - { - "has_x_y": ( - ("q", "y", "x"), - [[[np.nan, np.nan], [3, 4]], [[1, 2], [np.nan, np.nan]]], - ), - "has_x": (("q", "x"), [[1, 2], [1, 2]]), - "no_x_y": (("q", "z"), [[1, 2], [1, 2]]), - }, - coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "q": [0, np.nan]}, - ) + result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") assert_equal(ds2.no_x_y, result.no_x_y.transpose()) for var in ["has_x", "no_x_y"]: assert "y" not in result[var] - result2 = concat([ds2, ds1], dim="q") - assert_equal(ds_concat, result2) - with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") + with raises_regex(ValueError, "coordinates in some datasets but not others"): + concat([ds2, ds1], dim="q") + + +def test_concat_missing_var(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["humidity", "precipitation", "cloud cover"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"]) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_concat_all_empty(): + ds1 = Dataset() + ds2 = Dataset() + result = concat([ds1, ds2], dim="new_dim") + + assert_equal(result, Dataset()) + + +def test_concat_second_empty(): + ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + ds_result = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1}) + result = concat([ds1, ds2], dim="y") + + assert_equal(result, ds_result) + + +def test_multiple_missing_variables(): + datasets = create_concat_datasets(2, 123) + vars_to_drop = ["pressure", "cloud cover"] + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, datasets[1].humidity.values), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + 
assert_equal(result, ds_result) + + +def test_multiple_datasets_with_missing_variables(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundary cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] = np.concatenate( + ( + result_vars[vars_to_drop[i]], + datasets[d][vars_to_drop[i]].values, + ), + axis=2, + ) + else: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = np.full([1, 4, 2], np.nan) + else: + result_vars[vars_to_drop[i]] = np.concatenate( + (result_vars[vars_to_drop[i]], np.full([1, 4, 2], np.nan)), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], result_vars["temperature"]), + "pressure": (["x", "y", "day"], result_vars["pressure"]), + "humidity": (["x", "y", "day"], result_vars["humidity"]), + "precipitation": (["x", "y", "day"], result_vars["precipitation"]), + "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_multiple_datasets_with_multiple_missing_variables(): + vars_to_drop_in_first = ["temperature", "pressure"] + vars_to_drop_in_second = ["humidity", "precipitation", "cloud cover"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + +def test_type_of_missing_fill(): + datasets = create_typed_datasets(2, 123) + + vars = ["float", "float2", "string", "int", "datetime64",
"timedelta64"] + + # set up the test data + datasets[1] = datasets[1].drop_vars(vars[1:]) + + float_result = np.concatenate( + (datasets[0].float.values, datasets[1].float.values), axis=2 + ) + float2_result = np.concatenate( + (datasets[0].float2.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + # to correctly create the expected dataset we need to ensure we promote the string array to + # object type before filling as it will be promoted to that in the concat case. + # this matches the behavior of pandas + string_values = datasets[0].string.values + string_values = string_values.astype(object) + string_result = np.concatenate((string_values, np.full([1, 4, 2], np.nan)), axis=2) + datetime_result = np.concatenate( + (datasets[0].datetime64.values, np.full([1, 4, 2], np.datetime64("NaT"))), + axis=2, + ) + timedelta_result = np.concatenate( + (datasets[0].timedelta64.values, np.full([1, 4, 2], np.timedelta64("NaT"))), + axis=2, + ) + # string_result = string_result.astype(object) + int_result = np.concatenate( + (datasets[0].int.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + ds_result = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result), + "float2": (["x", "y", "day"], float2_result), + "string": (["x", "y", "day"], string_result), + "int": (["x", "y", "day"], int_result), + "datetime64": (["x", "y", "day"], datetime_result), + "timedelta64": (["x", "y", "day"], timedelta_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day", fill_value=dtypes.NA) + + assert_equal(result, ds_result) + + # test in the reverse order + float_result_rev = np.concatenate( + (datasets[1].float.values, datasets[0].float.values), axis=2 + ) + float2_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].float2.values), axis=2 + ) + string_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), string_values), axis=2 + ) + datetime_result_rev = np.concatenate( + (np.full([1, 4, 2], np.datetime64("NaT")), datasets[0].datetime64.values), + axis=2, + ) + timedelta_result_rev = np.concatenate( + (np.full([1, 4, 2], np.timedelta64("NaT")), datasets[0].timedelta64.values), + axis=2, + ) + int_result_rev = np.concatenate( + (np.full([1, 4, 2], np.nan), datasets[0].int.values), axis=2 + ) + ds_result_rev = Dataset( + data_vars={ + "float": (["x", "y", "day"], float_result_rev), + "float2": (["x", "y", "day"], float2_result_rev), + "string": (["x", "y", "day"], string_result_rev), + "int": (["x", "y", "day"], int_result_rev), + "datetime64": (["x", "y", "day"], datetime_result_rev), + "timedelta64": (["x", "y", "day"], timedelta_result_rev), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day3", "day4", "day1", "day2"], + }, + ) + result_rev = concat(datasets[::-1], dim="day", fill_value=dtypes.NA) + + assert_equal(result_rev, ds_result_rev) + + +def test_order_when_filling_missing(): + vars_to_drop_in_first = [] + # drop middle + vars_to_drop_in_second = ["humidity"] + datasets = create_concat_datasets(2, 123) + # set up the test data + datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first) + datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second) + + temperature_result = np.concatenate( + (datasets[0].temperature.values, datasets[1].temperature.values), axis=2 + ) + pressure_result = np.concatenate( + (datasets[0].pressure.values, 
datasets[1].pressure.values), axis=2 + ) + humidity_result = np.concatenate( + (datasets[0].humidity.values, np.full([1, 4, 2], np.nan)), axis=2 + ) + precipitation_result = np.concatenate( + (datasets[0].precipitation.values, datasets[1].precipitation.values), axis=2 + ) + cloudcover_result = np.concatenate( + (datasets[0]["cloud cover"].values, datasets[1]["cloud cover"].values), axis=2 + ) + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4"], + }, + ) + result = concat(datasets, dim="day") + + assert_equal(result, ds_result) + + result_keys = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + result_index = 0 + for k in result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + # test order when concat in reversed order + rev_result = concat(datasets[::-1], dim="day") + result_index = 0 + for k in rev_result.data_vars.keys(): + assert k == result_keys[result_index] + result_index += 1 + + class TestConcatDataset: @pytest.fixture @@ -332,6 +727,7 @@ def test_concat_multiindex(self): assert expected.equals(actual) assert isinstance(actual.x.to_index(), pd.MultiIndex) + # TODO add parameter for missing var @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value): datasets = [ @@ -394,6 +790,8 @@ def test_concat(self): expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual) + # TODO: is it really correct to expect the new dim to be concat_dim in this case + # I propose it's likely better to throw an exception actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual) From 4e01bd98437e52c68c73b52beb90a216e957aa98 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Mon, 30 Dec 2019 16:04:43 -0800 Subject: [PATCH 09/13] refactor variable promotion logic to dtypes.py --- xarray/core/concat.py | 7 +++---- xarray/core/dtypes.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b3fa694c04a..089253dc59e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -429,10 +429,9 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): continue v_fill_value = fill_value - if fill_value is dtypes.NA: - dtype, v_fill_value = dtypes.maybe_promote(ds[variable_key].dtype) - else: - dtype = ds[variable_key].dtype + dtype, v_fill_value = dtypes.get_fill_value_for_variable( + ds[variable_key], fill_value + ) union_of_variables[variable_key] = full_like( ds[variable_key], fill_value=v_fill_value, dtype=dtype ) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 4db2990accc..f52ff73a6b6 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -4,6 +4,7 @@ from . import utils + # Use as a sentinel value to indicate a dtype appropriate NA value.
NA = utils.ReprObject("") @@ -96,6 +97,37 @@ def get_fill_value(dtype): return fill_value +def get_fill_value_for_variable(variable, fill_value=NA): + """Return an appropriate fill value for this variable + + Parameters + ---------- + variable : Dataset or DataArray + fill_value : a suggested fill value to evaluate and promote if necessary + + Returns + ------- + dtype : Promoted dtype for fill value. + new_fill_value : Missing value corresponding to this dtype. + """ + from .dataset import Dataset + from .dataarray import DataArray + + if not (isinstance(variable, DataArray) or isinstance(variable, Dataset)): + raise TypeError( + "can only get fill value for xarray Dataset and DataArray " + "objects, got %s" % type(variable) + ) + + new_fill_value = fill_value + if fill_value is NA: + dtype, new_fill_value = maybe_promote(variable.dtype) + else: + dtype = variable.dtype + + return dtype, new_fill_value + + def get_pos_infinity(dtype): """Return an appropriate positive infinity for this dtype. From 515b9c19a0b0435d81cd8bc0594672c14dc36680 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Thu, 23 Jan 2020 19:51:41 -0800 Subject: [PATCH 10/13] refactor variable order code to its own internal method --- xarray/core/concat.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 089253dc59e..f802e3da2c1 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -380,20 +380,8 @@ def ensure_common_dims(vars): union_of_coordinates = OrderedDict() for ds in datasets: var_list = list(ds.variables.keys()) - # this logic maintains the order of the variable list and runs in - # O(n^2) where n is number of variables in the uncommon worst case - # where there are no missing variables this will be O(n) - for i in range(0, len(var_list)): - if var_list[i] not in union_of_variables: - # need to determine the correct place - # first add the new item which will be at the end - union_of_variables[var_list[i]] = None - union_of_variables.move_to_end(var_list[i]) - # move any items after this in the variables list to the end - # this will only happen for missing variables - for j in range(i + 1, len(var_list)): - if var_list[j] in union_of_variables: - union_of_variables.move_to_end(var_list[j]) + + _find_ordering_inplace(var_list, union_of_variables) # check that all datasets have the same coordinate set if len(union_of_coordinates) > 0: @@ -406,8 +394,8 @@ def ensure_common_dims(vars): % coord_set_diff ) - union_of_coordinates = dict( - union_of_coordinates.items() | dict.fromkeys(ds.coords).items() + union_of_coordinates = OrderedDict( + union_of_coordinates.items() | OrderedDict.fromkeys(ds.coords).items() ) # we don't want to fill coordinate variables so remove them @@ -477,6 +465,26 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): return result +def _find_ordering_inplace(l, union): + # this logic maintains the order of the variable list and runs in + # O(n^2) where n is number of variables in the uncommon worst case + # where there are no missing variables this will be O(n) + # could potentially be refactored to a more generic function to determine + # a consistent ordering of variables if proper consideration were + # given both to the runtime as well as to the user scenarios + for i in range(0, len(l)): + if l[i] not in union: + # need to determine the correct place + # first add the new item which will be at the end + union[l[i]] = None
union.move_to_end(l[i]) + # move any items after this in the variables list to the end + # this will only happen for missing variables + for j in range(i + 1, len(l)): + if l[j] in union: + union.move_to_end(l[j]) + def _dataarray_concat( arrays, dim, From cf5b8bdfb0fdaf626ecf7b83590f15aa9aef1d6b Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Fri, 24 Jan 2020 14:02:03 -0800 Subject: [PATCH 11/13] Add test for multiple consecutive missing variables --- xarray/tests/test_concat.py | 94 +++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 81511c2579f..752f51ba773 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -140,6 +140,78 @@ def test_concat_missing_var(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) + + +def test_concat_missing_multiple_consecutive_var(): + datasets = create_concat_datasets(3, 123) + vars_to_drop = ["pressure", "humidity"] + datasets[0] = datasets[0].drop_vars(vars_to_drop) + datasets[1] = datasets[1].drop_vars(vars_to_drop) + + temperature_result = np.concatenate( + ( + datasets[0].temperature.values, + datasets[1].temperature.values, + datasets[2].temperature.values, + ), + axis=2, + ) + pressure_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].pressure.values, + ), + axis=2, + ) + humidity_result = np.concatenate( + ( + np.full([1, 4, 2], np.nan), + np.full([1, 4, 2], np.nan), + datasets[2].humidity.values, + ), + axis=2, + ) + precipitation_result = np.concatenate( + ( + datasets[0].precipitation.values, + datasets[1].precipitation.values, + datasets[2].precipitation.values, + ), + axis=2, + ) + cloudcover_result = np.concatenate( + ( + datasets[0]["cloud cover"].values, + datasets[1]["cloud cover"].values, + datasets[2]["cloud cover"].values, + ), + axis=2, + ) + + ds_result = Dataset( + data_vars={ + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), + "precipitation": (["x", "y", "day"], precipitation_result), + "cloud cover": (["x", "y", "day"], cloudcover_result), + }, + coords={ + "lat": (["x", "y"], datasets[0].lat.values), + "lon": (["x", "y"], datasets[0].lon.values), + "day": ["day1", "day2", "day3", "day4", "day5", "day6"], + }, + ) + result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same assert_equal(result, ds_result) @@ -197,6 +269,10 @@ def test_multiple_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) @@ -241,8 +317,10 @@ def test_multiple_datasets_with_missing_variables(): ds_result = Dataset( data_vars={ - "temperature": (["x", "y", "day"], result_vars["temperature"]), + # pressure will be first in this since the first dataset is missing this var + # and there isn't a good way to determine that this should be first "pressure": (["x", "y", "day"], result_vars["pressure"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), "humidity": (["x", "y", "day"],
result_vars["humidity"]), "precipitation": (["x", "y", "day"], result_vars["precipitation"]), "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), @@ -255,6 +333,10 @@ def test_multiple_datasets_with_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) @@ -283,11 +365,13 @@ def test_multiple_datasets_with_multiple_missing_variables(): ) ds_result = Dataset( data_vars={ - "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), + # these to are at the end of the expected as they are missing from the first + # dataset in the concat list + "temperature": (["x", "y", "day"], temperature_result), + "pressure": (["x", "y", "day"], pressure_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -297,6 +381,10 @@ def test_multiple_datasets_with_multiple_missing_variables(): ) result = concat(datasets, dim="day") + r1 = list(result.data_vars.keys()) + r2 = list(ds_result.data_vars.keys()) + assert r1 == r2 # check the variables orders are the same + assert_equal(result, ds_result) From 3bf393170ae96230f63621cf720102468edaf097 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sat, 15 Feb 2020 10:30:29 -0800 Subject: [PATCH 12/13] Simplify variable ordering in concat for missing variable case --- xarray/core/concat.py | 44 +++++------- xarray/tests/test_concat.py | 139 ++++++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 48 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f802e3da2c1..961e3b8e727 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -265,21 +265,21 @@ def _parse_datasets(datasets): dims = set() all_coord_names = set() - data_vars = set() # list of data_vars + data_vars = {} # list of data_vars, using dict internally to maintain order dim_coords = {} # maps dim name to variable dims_sizes = {} # shared dimension sizes to expand variables for ds in datasets: dims_sizes.update(ds.dims) all_coord_names.update(ds.coords) - data_vars.update(ds.data_vars) + data_vars.update(dict.fromkeys(ds.data_vars)) for dim in set(ds.dims) - dims: if dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, dims_sizes, all_coord_names, data_vars + return dim_coords, dims_sizes, all_coord_names, list(data_vars.keys()) def _dataset_concat( @@ -308,7 +308,7 @@ def _dataset_concat( dim_names = set(dim_coords) unlabeled_dims = dim_names - coord_names - both_data_and_coords = coord_names & data_names + both_data_and_coords = coord_names & set(data_names) if both_data_and_coords: raise ValueError( "%r is a coordinate in some datasets but not others." 
% both_data_and_coords @@ -327,7 +327,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - dim_names + variables_to_merge = (coord_names | set(data_names)) - concat_over - dim_names result_vars = {} if variables_to_merge: @@ -376,27 +376,11 @@ def ensure_common_dims(vars): # if datasets have variables in drastically different orders # the resulting order will be dependent on the order they are in the list # passed to concat - union_of_variables = OrderedDict() - union_of_coordinates = OrderedDict() - for ds in datasets: - var_list = list(ds.variables.keys()) - - _find_ordering_inplace(var_list, union_of_variables) - - # check that all datasets have the same coordinate set - if len(union_of_coordinates) > 0: - coord_set_diff = ( - union_of_coordinates.keys() ^ ds.coords.keys() - ) & concat_over - if len(coord_set_diff) > 0: - raise ValueError( - "Variables %r are coordinates in some datasets but not others." - % coord_set_diff - ) + data_var_order = list(datasets[0].data_vars) + data_var_order += [e for e in data_names if e not in data_var_order] - union_of_coordinates = OrderedDict( - union_of_coordinates.items() | OrderedDict.fromkeys(ds.coords).items() - ) + union_of_variables = OrderedDict.fromkeys(data_var_order) + union_of_coordinates = OrderedDict.fromkeys(coord_names) # we don't want to fill coordinate variables so remove them for k in union_of_coordinates.keys(): @@ -422,7 +406,7 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): ) union_of_variables[variable_key] = full_like( - ds[variable_key], fill_value=v_fill_value, dtype=dtype + ds.variables[variable_key], fill_value=v_fill_value, dtype=dtype ) return @@ -430,12 +414,14 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): find_fill_variable_from_ds(v, union_of_variables, datasets) # create the concat list filling in missing variables + filling_coordinates = False while len(union_of_variables) > 0 or len(union_of_coordinates) > 0: k = None # get the variables in order if len(union_of_variables) > 0: k = union_of_variables.popitem(last=False) elif len(union_of_coordinates) > 0: + filling_coordinates = True k = union_of_coordinates.popitem() if k[0] in concat_over: @@ -444,6 +430,12 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): if k[0] in ds.variables: variables.append(ds.variables[k[0]]) else: + if filling_coordinates: + # in this case the coordinate is missing from a dataset + raise ValueError( + "Variables %r are coordinates in some datasets but not others." 
+ % k[0] + ) # var is missing, fill with cached value variables.append(k[1]) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 752f51ba773..45e2250b3f6 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -20,28 +20,43 @@ # helper method to create multiple test datasets to concat -def create_concat_datasets(num_datasets=2, seed=None): +def create_concat_datasets(num_datasets=2, seed=None, include_day=True): random.seed(seed) result = [] lat = np.random.randn(1, 4) lon = np.random.randn(1, 4) for i in range(num_datasets): - result.append( - Dataset( - data_vars={ - "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), - "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), - }, - coords={ - "lat": (["x", "y"], lat), - "lon": (["x", "y"], lon), - "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], - }, + if include_day: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "pressure": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "humidity": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "precipitation": (["x", "y", "day"], np.random.randn(1, 4, 2)), + "cloud cover": (["x", "y", "day"], np.random.randn(1, 4, 2)), + }, + coords={ + "lat": (["x", "y"], lat), + "lon": (["x", "y"], lon), + "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)], + }, + ) ) - ) + else: + result.append( + Dataset( + data_vars={ + "temperature": (["x", "y"], np.random.randn(1, 4)), + "pressure": (["x", "y"], np.random.randn(1, 4)), + "humidity": (["x", "y"], np.random.randn(1, 4)), + "precipitation": (["x", "y"], np.random.randn(1, 4)), + "cloud cover": (["x", "y"], np.random.randn(1, 4)), + }, + coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)}, + ) + ) + return result @@ -197,10 +212,10 @@ ds_result = Dataset( data_vars={ "temperature": (["x", "y", "day"], temperature_result), - "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), + "pressure": (["x", "y", "day"], pressure_result), + "humidity": (["x", "y", "day"], humidity_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -276,6 +291,77 @@ def test_multiple_missing_variables(): assert_equal(result, ds_result) +def test_concat_multiple_datasets_missing_vars_and_new_dim(): + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud cover", + ] + datasets = create_concat_datasets(len(vars_to_drop), 123, include_day=False) + # set up the test data + datasets = [datasets[i].drop_vars(vars_to_drop[i]) for i in range(len(datasets))] + + # set up the validation data + # the below code just drops one var per dataset depending on the location of the + # dataset in the list and allows us to quickly catch any boundary cases across + # the three equivalence classes of beginning, middle and end of the concat list + result_vars = dict.fromkeys(vars_to_drop) + for i in range(len(vars_to_drop)): + for d in range(len(datasets)): + if d != i: + if result_vars[vars_to_drop[i]] is None: + result_vars[vars_to_drop[i]] = datasets[d][vars_to_drop[i]].values + else: + result_vars[vars_to_drop[i]] =
np.concatenate( ( result_vars[vars_to_drop[i]], datasets[d][vars_to_drop[i]].values, ), axis=1, ) else: if result_vars[vars_to_drop[i]] is None: result_vars[vars_to_drop[i]] = np.full([1, 4], np.nan) else: result_vars[vars_to_drop[i]] = np.concatenate( (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1, ) # TODO: this test still has two unexpected errors: # 1: concat throws a MergeError expecting the temperature values to be the same, this doesn't seem to be correct in this case # as we are concatenating on new dims # 2: if the values are the same for a variable (working around #1) then it will likely not correctly add the new dim to the first variable in # the resulting set # ds_result = Dataset( # data_vars={ # # pressure will be first in this since the first dataset is missing this var # # and there isn't a good way to determine that this should be first # # this also means temperature will be last as the first data vars will # # determine the order for all that exist in that dataset # "pressure": (["x", "y", "day"], result_vars["pressure"]), # "humidity": (["x", "y", "day"], result_vars["humidity"]), # "precipitation": (["x", "y", "day"], result_vars["precipitation"]), # "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), # "temperature": (["x", "y", "day"], result_vars["temperature"]), # }, # coords={ # "lat": (["x", "y"], datasets[0].lat.values), # "lon": (["x", "y"], datasets[0].lon.values), # # "day": ["day" + str(d + 1) for d in range(2 * len(vars_to_drop))], # }, # ) # result = concat(datasets, dim="day") # r1 = list(result.data_vars.keys()) # r2 = list(ds_result.data_vars.keys()) # assert r1 == r2 # check the variables orders are the same # assert_equal(result, ds_result) def test_multiple_datasets_with_missing_variables(): vars_to_drop = [ "temperature", "pressure", "humidity", "precipitation", "cloud cover", ] @@ -319,11 +405,13 @@ data_vars={ # pressure will be first in this since the first dataset is missing this var # and there isn't a good way to determine that this should be first + # this also means temperature will be last as the first data vars will + # determine the order for all that exist in that dataset "pressure": (["x", "y", "day"], result_vars["pressure"]), - "temperature": (["x", "y", "day"], result_vars["temperature"]), "humidity": (["x", "y", "day"], result_vars["humidity"]), "precipitation": (["x", "y", "day"], result_vars["precipitation"]), "cloud cover": (["x", "y", "day"], result_vars["cloud cover"]), + "temperature": (["x", "y", "day"], result_vars["temperature"]), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -368,7 +456,7 @@ def test_multiple_datasets_with_multiple_missing_variables(): "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover": (["x", "y", "day"], cloudcover_result), - # these to are at the end of the expected as they are missing from the first + # these two are at the end of the expected as they are missing from the first # dataset in the concat list "temperature": (["x", "y", "day"], temperature_result), "pressure": (["x", "y", "day"], pressure_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -508,9 +596,9 @@ def test_order_when_filling_missing(): data_vars={ "temperature": (["x", "y", "day"], temperature_result), "pressure": (["x", "y", "day"], pressure_result), - "humidity": (["x", "y", "day"], humidity_result), "precipitation": (["x", "y", "day"], precipitation_result), "cloud cover":
(["x", "y", "day"], cloudcover_result), + "humidity": (["x", "y", "day"], humidity_result), }, coords={ "lat": (["x", "y"], datasets[0].lat.values), @@ -534,11 +622,18 @@ def test_order_when_filling_missing(): assert k == result_keys[result_index] result_index += 1 + result_keys_rev = [ + "temperature", + "pressure", + "precipitation", + "cloud cover", + "humidity", + ] # test order when concat in reversed order rev_result = concat(datasets[::-1], dim="day") result_index = 0 for k in rev_result.data_vars.keys(): - assert k == result_keys[result_index] + assert k == result_keys_rev[result_index] result_index += 1 From 03f9b3b85aee039f47dd693322492ab09f57fb73 Mon Sep 17 00:00:00 2001 From: Scott Chamberlin Date: Sat, 15 Feb 2020 10:44:25 -0800 Subject: [PATCH 13/13] remove unused helper method --- xarray/core/concat.py | 20 -------------------- xarray/tests/test_concat.py | 3 ++- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 961e3b8e727..82bfc0ea4d7 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -457,26 +457,6 @@ def find_fill_variable_from_ds(variable_key, union_of_variables, datasets): return result -def _find_ordering_inplace(l, union): - # this logic maintains the order of the variable list and runs in - # O(n^2) where n is number of variables in the uncommon worst case - # where there are no missing variables this will be O(n) - # could potentially be refactored to a more generic function to determine - # a consistent ordering of variables if proper consideration were - # given both to the runtime as well as to the user scenarios - for i in range(0, len(l)): - if l[i] not in union: - # need to determine the correct place - # first add the new item which will be at the end - union[l[i]] = None - union.move_to_end(l[i]) - # move any items after this in the variables list to the end - # this will only happen for missing variables - for j in range(i + 1, len(l)): - if l[j] in union: - union.move_to_end(l[j]) - - def _dataarray_concat( arrays, dim, diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 45e2250b3f6..f52c729a13e 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -291,6 +291,7 @@ def test_multiple_missing_variables(): assert_equal(result, ds_result) +@pytest.mark.xfail(strict=True) def test_concat_multiple_datasets_missing_vars_and_new_dim(): vars_to_drop = [ "temperature", @@ -329,6 +330,7 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): (result_vars[vars_to_drop[i]], np.full([1, 4], np.nan)), axis=1, ) # TODO: this test still has two unexpected errors: + # 1: concat throws a mergeerror expecting the temperature values to be the same, this doesn't seem to be correct in this case # as we are concating on new dims # 2: if the values are the same for a variable (working around #1) then it will likely not correct add the new dim to the first variable @@ -354,7 +356,6 @@ def test_concat_multiple_datasets_missing_vars_and_new_dim(): # ) # result = concat(datasets, dim="day") - # r1 = list(result.data_vars.keys()) # r2 = list(ds_result.data_vars.keys()) # assert r1 == r2 # check the variables orders are the same