Skip to content

Commit

Permalink
add a drop_conflicts strategy for merging attrs (#4827)
Browse files Browse the repository at this point in the history
  • Loading branch information
keewis authored Feb 10, 2021
1 parent 59088a0 commit 47889ee
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 29 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ New Features
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
By `Deepak Cherian <https://github.com/dcherian>`_
- add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg
(:issue:`4749`, :pull:`4827`).
By `Justus Magin <https://github.com/keewis>`_.
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
in the form of kwargs as well as a dict, like most similar methods.
Expand Down
12 changes: 8 additions & 4 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,14 +412,16 @@ def combine_nested(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:
- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.
Expand Down Expand Up @@ -625,14 +627,16 @@ def combine_by_coords(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:
- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.
Expand Down
6 changes: 4 additions & 2 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,16 @@ def concat(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "override"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "override"
String indicating how to combine attrs of the objects being merged:
- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.
Expand Down
29 changes: 25 additions & 4 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from . import dtypes, pdcompat
from .alignment import deep_align
from .duck_array_ops import lazy_array_equiv
from .utils import Frozen, compat_dict_union, dict_equiv
from .utils import Frozen, compat_dict_union, dict_equiv, equivalent
from .variable import Variable, as_variable, assert_unique_multiindex_level_names

if TYPE_CHECKING:
Expand Down Expand Up @@ -513,6 +513,24 @@ def merge_attrs(variable_attrs, combine_attrs):
"the same. Merging %s with %s" % (str(result), str(attrs))
)
return result
elif combine_attrs == "drop_conflicts":
result = {}
dropped_keys = set()
for attrs in variable_attrs:
result.update(
{
key: value
for key, value in attrs.items()
if key not in result and key not in dropped_keys
}
)
result = {
key: value
for key, value in result.items()
if key not in attrs or equivalent(attrs[key], value)
}
dropped_keys |= {key for key in attrs if key not in result}
return result
elif combine_attrs == "identical":
result = dict(variable_attrs[0])
for attrs in variable_attrs[1:]:
Expand Down Expand Up @@ -556,7 +574,8 @@ def merge_core(
Compatibility checks to use when merging variables.
join : {"outer", "inner", "left", "right"}, optional
How to combine objects with different indexes.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, optional
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, optional
How to combine attributes of objects
priority_arg : int, optional
Optional argument in `objects` that takes precedence over the others.
Expand Down Expand Up @@ -668,14 +687,16 @@ def merge(
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:
- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.
Expand Down
11 changes: 11 additions & 0 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,17 @@ def test_combine_coords_combine_attrs_identical(self):
objs, concat_dim="x", join="outer", combine_attrs="identical"
)

def test_combine_nested_combine_attrs_drop_conflicts(self):
objs = [
Dataset({"x": [0], "y": [0]}, attrs={"a": 1, "b": 2, "c": 3}),
Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 0, "d": 3}),
]
expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "c": 3, "d": 3})
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts"
)
assert_identical(expected, actual)

def test_infer_order_from_coords(self):
data = create_test_data()
objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
Expand Down
129 changes: 110 additions & 19 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,27 +258,118 @@ def test_concat_join_kwarg(self):
)
assert_identical(actual, expected)

def test_concat_combine_attrs_kwarg(self):
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs={"b": 42})
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs={"b": 42, "c": 43})

expected = {}
expected["drop"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]})
expected["no_conflicts"] = Dataset(
{"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42, "c": 43}
)
expected["override"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42})

with raises_regex(ValueError, "combine_attrs='identical'"):
actual = concat([ds1, ds2], dim="x", combine_attrs="identical")
with raises_regex(ValueError, "combine_attrs='no_conflicts'"):
ds3 = ds2.copy(deep=True)
ds3.attrs["b"] = 44
actual = concat([ds1, ds3], dim="x", combine_attrs="no_conflicts")
@pytest.mark.parametrize(
"combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 41, "b": 42, "c": 43},
{"b": 2, "c": 43, "d": 44},
{"a": 41, "c": 43, "d": 44},
False,
),
],
)
def test_concat_combine_attrs_kwarg(
self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
):
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs)
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs)

if expect_exception:
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
else:
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
expected = Dataset(
{"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs
)

for combine_attrs in expected:
assert_identical(actual, expected)

@pytest.mark.skip(reason="not implemented, yet (see #4827)")
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 41, "b": 42, "c": 43},
{"b": 2, "c": 43, "d": 44},
{"a": 41, "c": 43, "d": 44},
False,
),
],
)
def test_concat_combine_attrs_kwarg_variables(
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)})
ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)})

if expect_exception:
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
else:
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
assert_identical(actual, expected[combine_attrs])
expected = Dataset(
{"a": ("x", [0, 0], expected_attrs)},
{"x": ("x", [0, 1], expected_attrs)},
)

assert_identical(actual, expected)

def test_concat_promote_shape(self):
# mixed dims within variables
Expand Down
85 changes: 85 additions & 0 deletions xarray/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,20 @@ def test_merge_arrays_attrs_default(self):
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
(
"drop_conflicts",
{"a": 1, "b": np.array([2]), "c": np.array([3])},
{"b": 1, "c": np.array([3]), "d": 4},
{"a": 1, "c": np.array([3]), "d": 4},
False,
),
],
)
def test_merge_arrays_attrs(
Expand All @@ -109,13 +123,84 @@ def test_merge_arrays_attrs(
expected.attrs = expected_attrs
assert_identical(actual, expected)

@pytest.mark.skip(reason="not implemented, yet (see #4827)")
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
],
)
def test_merge_arrays_attrs_variables(
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
data = create_test_data()
data1 = data.copy()
data1.var1.attrs = attrs1
data1.dim1.attrs = attrs1
data2 = data.copy()
data2.var1.attrs = attrs2
data2.dim1.attrs = attrs2

if expect_exception:
with raises_regex(MergeError, "combine_attrs"):
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
else:
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
expected = data.copy()
expected.var1.attrs = expected_attrs
expected.dim1.attrs = expected_attrs

assert_identical(actual, expected)

def test_merge_attrs_override_copy(self):
ds1 = xr.Dataset(attrs={"x": 0})
ds2 = xr.Dataset(attrs={"x": 1})
ds3 = xr.merge([ds1, ds2], combine_attrs="override")
ds3.attrs["x"] = 2
assert ds1.x == 0

def test_merge_attrs_drop_conflicts(self):
ds1 = xr.Dataset(attrs={"a": 0, "b": 0, "c": 0})
ds2 = xr.Dataset(attrs={"b": 0, "c": 1, "d": 0})
ds3 = xr.Dataset(attrs={"a": 0, "b": 1, "c": 0, "e": 0})

actual = xr.merge([ds1, ds2, ds3], combine_attrs="drop_conflicts")
expected = xr.Dataset(attrs={"a": 0, "d": 0, "e": 0})
assert_identical(actual, expected)

def test_merge_dicts_simple(self):
actual = xr.merge([{"foo": 0}, {"bar": "one"}, {"baz": 3.5}])
expected = xr.Dataset({"foo": 0, "bar": "one", "baz": 3.5})
Expand Down

0 comments on commit 47889ee

Please sign in to comment.