From 88ee12a453af5dce948d2850b215b9d3c00e1cd0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 5 Nov 2018 20:09:13 +0000 Subject: [PATCH 01/96] concatenates along a single dimension --- xarray/core/combine.py | 51 ++++++++++++++++++++++++++++++++++++ xarray/tests/test_combine.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 6853939c02d..ded6430f4aa 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function import warnings +import toolz.itertoolz as itertoolz import pandas as pd @@ -369,6 +370,56 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _concat_nd(combined_IDs, concat_dims): + """ + Recursively concatenates an N-dimensional structure of datasets. + + No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, + because it is assumed that this has already been done. + + Parameters + ---------- + combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] + Structure containing all datasets to be concatenated with "tile_IDs" as keys, which + specify position within the desired final concatenated result. + concat_dims : sequence of str + + Returns + ------- + + """ + + for dim in concat_dims: + combined_IDs = _concat_all_along_last_dim(combined_IDs, dim) + + return combined_IDs.item + + +def _concat_all_along_last_dim(combined_IDs, dim): + + grouped = itertoolz.groupby(_rest_of_tile_id, combined_IDs.items()) + + new_combined_IDs = {} + for new_ID, group in grouped.items(): + print(new_ID) + print(group) + to_concat = [ds for old_ID, ds in group] + print(to_concat) + + new_combined_IDs[new_ID] = concat(to_concat, dim) + + return new_combined_IDs + + +def _rest_of_tile_id(single_id_ds_pair): + + # probably replace with something like lambda x: x[0][1:] + + tile_id, ds = single_id_ds_pair + tile_id_except_first_element = tile_id[1:] + return tile_id_except_first_element + + def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 2004b1e660f..7edd00dced5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function from copy import deepcopy +from itertools import product import numpy as np import pandas as pd @@ -8,6 +9,7 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems +from xarray.core.combine import _rest_of_tile_id, _concat_all_along_last_dim from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -396,3 +398,41 @@ def test_auto_combine_no_concat(self): 'y': (('baz', 'z'), [[1, 2]])}, {'baz': [100]}) assert_identical(expected, actual) + + +@pytest.fixture(scope='module') +def create_combined_ids(): + return _create_combined_ids + + +def _create_combined_ids(shape): + tile_ids = _create_tile_ids(shape) + return {tile_id: create_test_data(0) for tile_id in tile_ids} + + +def _create_tile_ids(shape): + tile_ids = product(*(range(i) for i in shape)) + return list(tile_ids) + + +class TestConcatND(object): + def test_get_tile_ids(self, create_combined_ids): + shape = (1, 2, 3) + combined_ids = _create_combined_ids(shape) + print(combined_ids.keys()) + + for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): + expected_new_tile_id = tile_id[1:] + assert _rest_of_tile_id(combined) == expected_new_tile_id + + def test_concat_once(self, create_combined_ids): + shape = (2,) + combined_ids = _create_combined_ids(shape) + print(combined_ids) + + result = _concat_all_along_last_dim(combined_ids, 'dim1') + print('-------------------') + print(result[()]) + expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') + print(expected_ds) + assert_equal(result[()], expected_ds) From 1aaa0756f6b4d0e865744ab94bf39b55ec5aab02 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 6 Nov 2018 12:42:34 +0000 Subject: [PATCH 02/96] Wrote function to find correct tile_IDs from nested list of datasets --- xarray/core/combine.py | 81 +++++++++++++++++++++++++++--------- xarray/testing.py | 8 ++++ xarray/tests/__init__.py | 2 +- xarray/tests/test_combine.py | 64 ++++++++++++++++++++++++++-- 4 files changed, 130 insertions(+), 25 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index ded6430f4aa..d6e2aef6456 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -367,21 +367,69 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): + """ + Given a list of lists (of lists...) of datasets, returns a dictionary + with the index of each dataset in the nested list structure as the key. + + Recursively traverses the given structure, while keeping track of the current + position. + + Parameters + ---------- + entry : list[list[xarray.Dataset, xarray.Dataset, ...]] + List of lists of arbitrary depth, containing datasets in the order they + are to be concatenated. + + Returns + ------- + combined_tile_ids : dict[tuple(int, ...), xarray.Dataset] + """ + + from .dataset import Dataset + + if isinstance(entry, list): + # Check if list is redundant + if len(entry) == 1: + raise TypeError('Redundant list nesting at ' + 'position ' + str(current_pos)) + + # Dive down tree + current_pos.append(0) + for i, item in enumerate(entry): + current_pos[-1] = i + combined_tile_ids = _infer_tile_ids_from_nested_list(item, current_pos, + combined_tile_ids) + # Move back up tree + del current_pos[-1] + return combined_tile_ids + + elif isinstance(entry, Dataset): + # Termination condition + combined_tile_ids[tuple(current_pos)] = entry + return combined_tile_ids + + else: + raise TypeError("Element at position " + str(current_pos) + + " is neither a list nor an xarray.Dataset") + + +def _check_shape_tile_ids(combined_tile_ids): + ... 
def _concat_nd(combined_IDs, concat_dims): """ Recursively concatenates an N-dimensional structure of datasets. - No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, - because it is assumed that this has already been done. + No checks are performed on the consistency of the datasets, concat_dims or + tile_IDs, because it is assumed that this has already been done. Parameters ---------- combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] - Structure containing all datasets to be concatenated with "tile_IDs" as keys, which - specify position within the desired final concatenated result. + Structure containing all datasets to be concatenated with "tile_IDs" as + keys, which specify position within the desired final concatenated result. concat_dims : sequence of str Returns @@ -390,34 +438,27 @@ def _concat_nd(combined_IDs, concat_dims): """ for dim in concat_dims: - combined_IDs = _concat_all_along_last_dim(combined_IDs, dim) + combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) return combined_IDs.item -def _concat_all_along_last_dim(combined_IDs, dim): - - grouped = itertoolz.groupby(_rest_of_tile_id, combined_IDs.items()) - +def _concat_all_along_first_dim(combined_IDs, dim): + grouped = itertoolz.groupby(_tile_id_except_first_element, combined_IDs.items()) new_combined_IDs = {} for new_ID, group in grouped.items(): - print(new_ID) - print(group) to_concat = [ds for old_ID, ds in group] - print(to_concat) - new_combined_IDs[new_ID] = concat(to_concat, dim) - return new_combined_IDs -def _rest_of_tile_id(single_id_ds_pair): - +def _tile_id_except_first_element(single_id_ds_pair): # probably replace with something like lambda x: x[0][1:] - tile_id, ds = single_id_ds_pair - tile_id_except_first_element = tile_id[1:] - return tile_id_except_first_element + return tile_id[1:] + + +_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' def auto_combine(datasets, diff --git a/xarray/testing.py b/xarray/testing.py index ee5a54cd7dc..03c5354cdff 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): else: raise TypeError('{} not supported by assertion comparison' .format(type(a))) + + +def assert_combined_tile_ids_equal(dict1, dict2): + assert len(dict1) == len(dict2) + for k, v in dict1.items(): + assert k in dict2.keys() + assert_equal(dict1[k], dict2[k]) + diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a45f71bbc3b..cd66ad82356 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -15,7 +15,7 @@ from xarray.core import utils from xarray.core.indexing import ExplicitlyIndexed from xarray.testing import (assert_equal, assert_identical, # noqa: F401 - assert_allclose) + assert_allclose, assert_combined_tile_ids_equal) from xarray.plot.utils import import_seaborn try: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 7edd00dced5..3ec602cb6f9 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -9,11 +9,13 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems -from xarray.core.combine import _rest_of_tile_id, _concat_all_along_last_dim +from xarray.core.combine import ( + _tile_id_except_first_element, _concat_all_along_first_dim, + _infer_tile_ids_from_nested_list) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, - raises_regex, requires_dask) + assert_combined_tile_ids_equal, raises_regex, requires_dask) from .test_dataset import create_test_data @@ -400,6 +402,55 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +class TestTileIDsFromNestedList(object): + # TODO test ordering is correct by seeding datasets differently + def test_1d(self): + ds = create_test_data(0) + input = [ds, ds] + + expected = {(0,): ds, (1,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_2d(self): + ds = create_test_data(0) + input = [[ds, ds], [ds, ds], [ds, ds]] + + expected = {(0, 0): ds, (0, 1): ds, + (1, 0): ds, (1, 1): ds, + (2, 0): ds, (2, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_3d(self): + ds = create_test_data(0) + input = [[[ds, ds], [ds, ds], [ds, ds]], + [[ds, ds], [ds, ds], [ds, ds]]] + + expected = {(0, 0, 0): ds, (0, 0, 1): ds, + (0, 1, 0): ds, (0, 1, 1): ds, + (0, 2, 0): ds, (0, 2, 1): ds, + (1, 0, 0): ds, (1, 0, 1): ds, + (1, 1, 0): ds, (1, 1, 1): ds, + (1, 2, 0): ds, (1, 2, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_redundant_nesting_gotcha(self): + ds = create_test_data(0) + input = [[ds], [ds]] + + expected = {(0,): ds, (1,): ds} + with pytest.raises(TypeError): + actual = _infer_tile_ids_from_nested_list(input, [], {}) + + def test_bad_element(self): + ds = create_test_data() + input = [ds, 'bad_element'] + with pytest.raises(TypeError): + _infer_tile_ids_from_nested_list(input, [], {}) + + @pytest.fixture(scope='module') def create_combined_ids(): return _create_combined_ids @@ -423,16 +474,21 @@ def test_get_tile_ids(self, create_combined_ids): for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] - assert _rest_of_tile_id(combined) == expected_new_tile_id + assert _tile_id_except_first_element(combined) == expected_new_tile_id def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) print(combined_ids) - result = _concat_all_along_last_dim(combined_ids, 'dim1') + result = _concat_all_along_first_dim(combined_ids, 'dim1') print('-------------------') print(result[()]) expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') print(expected_ds) assert_equal(result[()], expected_ds) + + @pytest.mark.skip + def test_concat_twice(self, create_combined_ids): + shape = (2, 3) + combined_ids = _create_combined_ids(shape) \ No newline at end of file From dbb371d7209011180d4621b77e93e8b1d70452da Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 7 Nov 2018 11:41:42 +0000 Subject: [PATCH 03/96] Wrote function to check that combined_tile_ids structure is valid --- xarray/core/combine.py | 23 ++++++++++++++--- xarray/tests/test_combine.py | 48 +++++++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d6e2aef6456..37cdc3e7f12 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -392,7 +392,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): # Check if list is redundant if len(entry) == 1: raise TypeError('Redundant list nesting at ' - 'position ' + str(current_pos)) + 'position ' + 
str(tuple(current_pos))) # Dive down tree current_pos.append(0) @@ -410,12 +410,29 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): return combined_tile_ids else: - raise TypeError("Element at position " + str(current_pos) + + raise TypeError("Element at position " + str(tuple(current_pos)) + " is neither a list nor an xarray.Dataset") def _check_shape_tile_ids(combined_tile_ids): - ... + # TODO create custom exception class instead of using asserts? + + tile_ids = combined_tile_ids.keys() + + # Check all tuples are the same length + lengths = [len(id) for id in tile_ids] + assert set(lengths) == {lengths[0]} + + # Check each dimension has indices 0 to n represented with no gaps + for dim in range(lengths[0]): + indices = [id[dim] for id in tile_ids] + assert len(indices) > 1 + assert sorted(indices) == range(max(indices)) + + # Check only datasets are contained + from .dataset import Dataset + for v in combined_tile_ids.values(): + assert isinstance(v, Dataset) def _concat_nd(combined_IDs, concat_dims): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 3ec602cb6f9..91742864436 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -11,7 +11,7 @@ from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( _tile_id_except_first_element, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list) + _infer_tile_ids_from_nested_list, _check_shape_tile_ids) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -402,6 +402,14 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +# TODO get this fixture to work!! +@pytest.fixture(scope='module') +def ds(): + def _create_test_data(s=0): + return create_test_data(seed=s) + return _create_test_data + + class TestTileIDsFromNestedList(object): # TODO test ordering is correct by seeding datasets differently def test_1d(self): @@ -440,9 +448,8 @@ def test_redundant_nesting_gotcha(self): ds = create_test_data(0) input = [[ds], [ds]] - expected = {(0,): ds, (1,): ds} with pytest.raises(TypeError): - actual = _infer_tile_ids_from_nested_list(input, [], {}) + _infer_tile_ids_from_nested_list(input, [], {}) def test_bad_element(self): ds = create_test_data() @@ -450,6 +457,14 @@ def test_bad_element(self): with pytest.raises(TypeError): _infer_tile_ids_from_nested_list(input, [], {}) + def test_ragged_input(self): + ds = create_test_data(0) + input = [ds, [ds, ds]] + + expected = {(0,): ds, (1, 0): ds, (1, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -491,4 +506,29 @@ def test_concat_once(self, create_combined_ids): @pytest.mark.skip def test_concat_twice(self, create_combined_ids): shape = (2, 3) - combined_ids = _create_combined_ids(shape) \ No newline at end of file + combined_ids = _create_combined_ids(shape) + + +class TestCheckShapeTileIDs(object): + def test_check_lengths(self): + ds = create_test_data(0) + combined_tile_ids = {(0,): ds, (0, 1): ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_non_zero_length_along_all_dims(self): + ds = create_test_data(0) + combined_tile_ids = {(0, 0): ds, (1, 0): ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_linearity(self): + ds = create_test_data(0) + combined_tile_ids = {(0,): ds, (2,): 
ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_contains_datasets(self): + combined_tile_ids = {(0,): 'a', (1,): 'b'} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) From cc4d7438320d67931d5e5f4d655922d3f129ee2c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 7 Nov 2018 16:20:07 +0000 Subject: [PATCH 04/96] Added test of 2d-concatenation --- xarray/core/combine.py | 10 +++++++--- xarray/tests/test_combine.py | 26 ++++++++++++++------------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 37cdc3e7f12..4fa98a70858 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -457,19 +457,23 @@ def _concat_nd(combined_IDs, concat_dims): for dim in concat_dims: combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) - return combined_IDs.item + combined_ds = combined_IDs[()] + + return combined_ds def _concat_all_along_first_dim(combined_IDs, dim): - grouped = itertoolz.groupby(_tile_id_except_first_element, combined_IDs.items()) + grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) new_combined_IDs = {} + + # TODO Would there be any point in parallelizing this concatenation step? for new_ID, group in grouped.items(): to_concat = [ds for old_ID, ds in group] new_combined_IDs[new_ID] = concat(to_concat, dim) return new_combined_IDs -def _tile_id_except_first_element(single_id_ds_pair): +def _new_tile_id(single_id_ds_pair): # probably replace with something like lambda x: x[0][1:] tile_id, ds = single_id_ds_pair return tile_id[1:] diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 91742864436..392df28ddd2 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,8 +10,8 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _tile_id_except_first_element, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list, _check_shape_tile_ids) + _new_tile_id, _concat_all_along_first_dim, + _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _concat_nd) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -485,28 +485,30 @@ class TestConcatND(object): def test_get_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = _create_combined_ids(shape) - print(combined_ids.keys()) for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] - assert _tile_id_except_first_element(combined) == expected_new_tile_id + assert _new_tile_id(combined) == expected_new_tile_id def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) - print(combined_ids) - + ds = create_test_data(0) result = _concat_all_along_first_dim(combined_ids, 'dim1') - print('-------------------') - print(result[()]) - expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') - print(expected_ds) - assert_equal(result[()], expected_ds) - @pytest.mark.skip + expected_ds = concat([ds, ds], 'dim1') + assert_combined_tile_ids_equal(result, {(): expected_ds}) + def test_concat_twice(self, create_combined_ids): shape = (2, 3) combined_ids = _create_combined_ids(shape) + result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) + + ds = create_test_data(0) + partway = concat([ds, ds], dim='dim1') + expected = concat([partway, partway, partway], dim='dim2') + + assert_equal(result, expected) class TestCheckShapeTileIDs(object): From d2fc7e723fe4b99032f9ac4445fb3fc4404b4477 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 8 Nov 2018 08:53:11 +0000 Subject: [PATCH 05/96] Tests now check that dataset ordering is correct --- xarray/tests/test_combine.py | 68 ++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 392df28ddd2..f7372165eb2 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -402,45 +402,36 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) -# TODO get this fixture to work!! 
-@pytest.fixture(scope='module') -def ds(): - def _create_test_data(s=0): - return create_test_data(seed=s) - return _create_test_data - - class TestTileIDsFromNestedList(object): - # TODO test ordering is correct by seeding datasets differently def test_1d(self): - ds = create_test_data(0) - input = [ds, ds] + ds = create_test_data + input = [ds(0), ds(1)] - expected = {(0,): ds, (1,): ds} + expected = {(0,): ds(0), (1,): ds(1)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): - ds = create_test_data(0) - input = [[ds, ds], [ds, ds], [ds, ds]] + ds = create_test_data + input = [[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]] - expected = {(0, 0): ds, (0, 1): ds, - (1, 0): ds, (1, 1): ds, - (2, 0): ds, (2, 1): ds} + expected = {(0, 0): ds(0), (0, 1): ds(1), + (1, 0): ds(2), (1, 1): ds(3), + (2, 0): ds(4), (2, 1): ds(5)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): - ds = create_test_data(0) - input = [[[ds, ds], [ds, ds], [ds, ds]], - [[ds, ds], [ds, ds], [ds, ds]]] - - expected = {(0, 0, 0): ds, (0, 0, 1): ds, - (0, 1, 0): ds, (0, 1, 1): ds, - (0, 2, 0): ds, (0, 2, 1): ds, - (1, 0, 0): ds, (1, 0, 1): ds, - (1, 1, 0): ds, (1, 1, 1): ds, - (1, 2, 0): ds, (1, 2, 1): ds} + ds = create_test_data + input = [[[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]], + [[ds(6), ds(7)], [ds(8), ds(9)], [ds(10), ds(11)]]] + + expected = {(0, 0, 0): ds(0), (0, 0, 1): ds(1), + (0, 1, 0): ds(2), (0, 1, 1): ds(3), + (0, 2, 0): ds(4), (0, 2, 1): ds(5), + (1, 0, 0): ds(6), (1, 0, 1): ds(7), + (1, 1, 0): ds(8), (1, 1, 1): ds(9), + (1, 2, 0): ds(10), (1, 2, 1): ds(11)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) @@ -452,16 +443,16 @@ def test_redundant_nesting_gotcha(self): _infer_tile_ids_from_nested_list(input, [], {}) def test_bad_element(self): - ds = create_test_data() + ds = create_test_data(0) input = [ds, 'bad_element'] with pytest.raises(TypeError): _infer_tile_ids_from_nested_list(input, [], {}) def test_ragged_input(self): - ds = create_test_data(0) - input = [ds, [ds, ds]] + ds = create_test_data + input = [ds(0), [ds(1), ds(2)]] - expected = {(0,): ds, (1, 0): ds, (1, 1): ds} + expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) @@ -473,7 +464,8 @@ def create_combined_ids(): def _create_combined_ids(shape): tile_ids = _create_tile_ids(shape) - return {tile_id: create_test_data(0) for tile_id in tile_ids} + nums = range(len(tile_ids)) + return {tile_id: create_test_data(num) for tile_id, num in zip(tile_ids, nums)} def _create_tile_ids(shape): @@ -493,10 +485,10 @@ def test_get_tile_ids(self, create_combined_ids): def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) - ds = create_test_data(0) + ds = create_test_data result = _concat_all_along_first_dim(combined_ids, 'dim1') - expected_ds = concat([ds, ds], 'dim1') + expected_ds = concat([ds(0), ds(1)], 'dim1') assert_combined_tile_ids_equal(result, {(): expected_ds}) def test_concat_twice(self, create_combined_ids): @@ -504,9 +496,11 @@ def test_concat_twice(self, create_combined_ids): combined_ids = _create_combined_ids(shape) result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) - ds = create_test_data(0) - partway = concat([ds, ds], dim='dim1') - expected = 
concat([partway, partway, partway], dim='dim2')
+        ds = create_test_data
+        partway1 = concat([ds(0), ds(3)], dim='dim1')
+        partway2 = concat([ds(1), ds(4)], dim='dim1')
+        partway3 = concat([ds(2), ds(5)], dim='dim1')
+        expected = concat([partway1, partway2, partway3], dim='dim2')
 
         assert_equal(result, expected)

From e3f3699a76d2ed2f88605ddd0cc2b46d6f89baa4 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 8 Nov 2018 09:11:17 +0000
Subject: [PATCH 06/96] Test concatenation along a new dimension

---
 xarray/tests/test_combine.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index f7372165eb2..f171f998a7e 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -482,13 +482,14 @@ def test_get_tile_ids(self, create_combined_ids):
             expected_new_tile_id = tile_id[1:]
             assert _new_tile_id(combined) == expected_new_tile_id
 
-    def test_concat_once(self, create_combined_ids):
+    @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim'])
+    def test_concat_once(self, create_combined_ids, concat_dim):
         shape = (2,)
         combined_ids = _create_combined_ids(shape)
         ds = create_test_data
-        result = _concat_all_along_first_dim(combined_ids, 'dim1')
+        result = _concat_all_along_first_dim(combined_ids, dim=concat_dim)
 
-        expected_ds = concat([ds(0), ds(1)], 'dim1')
+        expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
         assert_combined_tile_ids_equal(result, {(): expected_ds})
 
     def test_concat_twice(self, create_combined_ids):

From 55bf6853bb41e5c8de1da749859acd9e9aa1e0b9 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 9 Nov 2018 09:14:32 +0000
Subject: [PATCH 07/96] Started generalising auto_combine to N-D by integrating the N-D concatenation algorithm

---
 xarray/core/combine.py | 131 ++++++++++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 34 deletions(-)

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 4fa98a70858..b7de0d15e0d 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -367,6 +367,30 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
     return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
 
 
+_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
+
+
+def _infer_concat_order_from_nested_list(datasets, concat_dims):
+
+    # TODO check that datasets is a list containing multiple elements
+
+    combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {})
+
+    # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call
+    # TODO would be faster in this case to just work out the concat_dims once here
+    tile_id, ds = combined_ids[0]
+    n_dims = len(tile_id)
+    if concat_dims is None:
+        concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims
+    else:
+        if len(concat_dims) != n_dims:
+            raise ValueError("concat_dims is of length " + str(len(concat_dims)) +
+                             " but the datasets passed are nested in a " +
+                             str(n_dims) + "-dimensional structure")
+
+    return concat_dims, combined_ids
+
+
+def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids):
+    """
+    Given a list of lists (of lists...)
of datasets, returns a dictionary @@ -435,9 +459,15 @@ def _check_shape_tile_ids(combined_tile_ids): assert isinstance(v, Dataset) -def _concat_nd(combined_IDs, concat_dims): +def _data_vars(combined_id): + id, ds = combined_id + return tuple(sorted(ds.data_vars)) + + +def _combine_nd(combined_IDs, concat_dims, data_vars='all', + coords='different', compat='no_conflicts'): """ - Recursively concatenates an N-dimensional structure of datasets. + Concatenates and merges an N-dimensional structure of datasets. No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, because it is assumed that this has already been done. @@ -446,31 +476,34 @@ def _concat_nd(combined_IDs, concat_dims): ---------- combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] Structure containing all datasets to be concatenated with "tile_IDs" as - keys, which specify position within the desired final concatenated result. + keys, which specify position within the desired final combined result. concat_dims : sequence of str + The dimensions along which the datasets should be concatenated. Must be + in order, and the length must match Returns ------- """ - for dim in concat_dims: - combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) - - combined_ds = combined_IDs[()] + # Organise by data variables + grouped_by_data_vars = itertoolz.groupby(_data_vars, + combined_IDs.items()).values() + concatenated_datasets = [] + for tiled_datasets in grouped_by_data_vars: + concatenated_ids = tiled_datasets - return combined_ds + # Perform N-D dimensional concatenation + for concat_dim in concat_dims: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + concatenated_ids = _concat_along_first_dim(concatenated_ids, + dim=dim, + data_vars=data_vars, + coords=coords) + concatenated_datasets.append(concatenated_ids.values()) -def _concat_all_along_first_dim(combined_IDs, dim): - grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) - new_combined_IDs = {} - - # TODO Would there be any point in parallelizing this concatenation step? - for new_ID, group in grouped.items(): - to_concat = [ds for old_ID, ds in group] - new_combined_IDs[new_ID] = concat(to_concat, dim) - return new_combined_IDs + return merge(concatenated_datasets, compat=compat) def _new_tile_id(single_id_ds_pair): @@ -479,13 +512,25 @@ def _new_tile_id(single_id_ds_pair): return tile_id[1:] -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _concat_along_first_dim(combined_IDs, dim, data_vars='all', + coords='different'): + grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) + new_combined_IDs = {} + + # TODO Would there be any point in parallelizing this concatenation step? + for new_ID, group in grouped.items(): + to_concat = [ds for old_ID, ds in group] + new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, + data_vars=data_vars, + coords=coords) + return new_combined_IDs def auto_combine(datasets, - concat_dim=_CONCAT_DIM_DEFAULT, + concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', - data_vars='all', coords='different'): + data_vars='all', coords='different', + infer_order_from_coords=True): """Attempt to auto-magically combine the given datasets into one. This method attempts to combine a list of datasets into a single entity by @@ -504,10 +549,10 @@ def auto_combine(datasets, ---------- datasets : sequence of xarray.Dataset Dataset objects to merge. 
- concat_dim : str or DataArray or Index, optional - Dimension along which to concatenate variables, as used by + concat_dims : list of str or DataArray or Index, optional + Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if - the dimension along which you want to concatenate is not a dimension + the dimensions along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining @@ -528,8 +573,14 @@ def auto_combine(datasets, of all non-null values. data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat - coords : {'minimal', 'different', 'all' o list of str}, optional + coords : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat + infer_order_from_coords : bool, optional + If true attempt to deduce the order in which the datasets should be + concatenated from their coordinates. To do this the coordinates should + be monotonic along the dimension to be concatenated. + If false instead read the order from the structure the datasets are + supplied in. This structure should be a nested list of lists. Returns ------- @@ -540,15 +591,27 @@ def auto_combine(datasets, concat Dataset.merge """ - from toolz import itertoolz - if concat_dim is not None: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), - datasets).values() - concatenated = [_auto_concat(ds, dim=dim, - data_vars=data_vars, coords=coords) - for ds in grouped] + if concat_dims is not None: + + # TODO this could be where we would optionally check alignment, as in #2039 + + # Organise datasets in concatentation order in N-D + if infer_order_from_coords: + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # i.e. (shoyer's (1) from discussion in #2159) + raise NotImplementedError + else: + # Determine tile_IDs by structure of input in N-D (i.e. 
ordering in list-of-lists) + concat_dims, combined_ids = _infer_concat_order_from_nested_list(datasets, concat_dims) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) else: + # Case of no concatenation wanted concatenated = datasets - merged = merge(concatenated, compat=compat) - return merged + combined = merge(concatenated, compat=compat) + return combined From 845206c96da16e2dd44e04f2b76450904c17965f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 9 Nov 2018 18:58:07 +0000 Subject: [PATCH 08/96] All unit tests now passing --- xarray/backends/api.py | 6 +- xarray/core/combine.py | 87 +++++++++++++-------------- xarray/tests/test_backends.py | 4 +- xarray/tests/test_combine.py | 107 ++++++++++++++++++++++++---------- 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ca440872d73..d3c5efb6ec2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -480,7 +480,7 @@ def close(self): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', autoclose=None, parallel=False, **kwargs): @@ -620,11 +620,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # close datasets in case of a ValueError try: - if concat_dim is _CONCAT_DIM_DEFAULT: + if concat_dims is _CONCAT_DIM_DEFAULT: combined = auto_combine(datasets, compat=compat, data_vars=data_vars, coords=coords) else: - combined = auto_combine(datasets, concat_dim=concat_dim, + combined = auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords) except ValueError: diff --git a/xarray/core/combine.py b/xarray/core/combine.py index b7de0d15e0d..67b514c6e3c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -372,23 +372,21 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_nested_list(datasets, concat_dims): - # TODO check that datasets is a list containing multiple elements - combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call - # TODO would be faster in this case to just work out the concat_dims once here - tile_id, ds = combined_ids[0] + # TODO might be faster in this case to just work out the concat_dims once here + tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims is None: + if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims else: if len(concat_dims) != n_dims: - raise ValueError("concat_dims is of length " + str(len(concat_dims)) + raise ValueError("concat_dims has length " + str(len(concat_dims)) + " but the datasets passed are nested in a " + str(n_dims) + "-dimensional structure") - return concat_dims, combined_ids + return combined_ids, concat_dims def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): @@ -396,8 +394,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): Given a list of lists (of lists...) 
of datasets, returns a dictionary with the index of each dataset in the nested list structure as the key. - Recursively traverses the given structure, while keeping track of the current - position. + Recursively traverses the given structure, while keeping track of the + current position. Parameters ---------- @@ -413,17 +411,13 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): from .dataset import Dataset if isinstance(entry, list): - # Check if list is redundant - if len(entry) == 1: - raise TypeError('Redundant list nesting at ' - 'position ' + str(tuple(current_pos))) - - # Dive down tree + # Dive down tree and recursively open the next list current_pos.append(0) for i, item in enumerate(entry): current_pos[-1] = i - combined_tile_ids = _infer_tile_ids_from_nested_list(item, current_pos, - combined_tile_ids) + combined_tile_ids = _infer_tile_ids_from_nested_list\ + (item, current_pos, combined_tile_ids) + # Move back up tree del current_pos[-1] return combined_tile_ids @@ -435,11 +429,13 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): else: raise TypeError("Element at position " + str(tuple(current_pos)) + - " is neither a list nor an xarray.Dataset") + " is of type " + str(type(entry)) + ", which is " + "neither a list nor an xarray.Dataset") def _check_shape_tile_ids(combined_tile_ids): # TODO create custom exception class instead of using asserts? + # Is this function even necessary? tile_ids = combined_tile_ids.keys() @@ -447,12 +443,6 @@ def _check_shape_tile_ids(combined_tile_ids): lengths = [len(id) for id in tile_ids] assert set(lengths) == {lengths[0]} - # Check each dimension has indices 0 to n represented with no gaps - for dim in range(lengths[0]): - indices = [id[dim] for id in tile_ids] - assert len(indices) > 1 - assert sorted(indices) == range(max(indices)) - # Check only datasets are contained from .dataset import Dataset for v in combined_tile_ids.values(): @@ -489,11 +479,16 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', # Organise by data variables grouped_by_data_vars = itertoolz.groupby(_data_vars, combined_IDs.items()).values() + concatenated_datasets = [] - for tiled_datasets in grouped_by_data_vars: - concatenated_ids = tiled_datasets + for tiled_datasets_group in grouped_by_data_vars: + + # Convert list of tuples back into a dictionary + concatenated_ids = dict(tiled_datasets_group) # Perform N-D dimensional concatenation + # Each iteration of this loop reduces the length of the tile_IDs tuples + # by one. It always removes the first for concat_dim in concat_dims: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim @@ -501,13 +496,13 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', dim=dim, data_vars=data_vars, coords=coords) - concatenated_datasets.append(concatenated_ids.values()) - + concatenated_datasets = concatenated_datasets \ + + list(concatenated_ids.values()) return merge(concatenated_datasets, compat=compat) def _new_tile_id(single_id_ds_pair): - # probably replace with something like lambda x: x[0][1:] + # TODO maybe replace with something like lambda x: x[0][1:]? tile_id, ds = single_id_ds_pair return tile_id[1:] @@ -530,20 +525,12 @@ def auto_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', data_vars='all', coords='different', - infer_order_from_coords=True): + infer_order_from_coords=False): """Attempt to auto-magically combine the given datasets into one. 
- This method attempts to combine a list of datasets into a single entity by - inspecting metadata and using a combination of concat and merge. - - It does not concatenate along more than one dimension or sort data under - any circumstances. It does align coordinates, but different variables on - datasets can cause it to fail under some scenarios. In complex cases, you - may need to clean up your data and use ``concat``/``merge`` explicitly. - - ``auto_combine`` works well if you have N years of data and M data - variables, and each combination of a distinct time period and set of data - variables is saved its own dataset. + This method attempts to combine a list (or nested list of lists) of + datasets into a single entity by inspecting metadata and using a + combination of concat and merge. Parameters ---------- @@ -552,9 +539,9 @@ def auto_combine(datasets, concat_dims : list of str or DataArray or Index, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if - the dimensions along which you want to concatenate is not a dimension - in the original datasets, e.g., if you want to stack a collection of - 2D arrays along a third dimension. + any of the dimensions along which you want to concatenate are not a + dimension in the original datasets, e.g., if you want to stack a + collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. @@ -591,18 +578,24 @@ def auto_combine(datasets, concat Dataset.merge """ + + # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? + if concat_dims is not None: - # TODO this could be where we would optionally check alignment, as in #2039 + # TODO this could be where we would optionally check alignment, as in #2039? # Organise datasets in concatentation order in N-D if infer_order_from_coords: # TODO Use coordinates to determine tile_ID for each dataset in N-D # i.e. (shoyer's (1) from discussion in #2159) raise NotImplementedError + # Once this is implemented I think it should be the default else: - # Determine tile_IDs by structure of input in N-D (i.e. ordering in list-of-lists) - concat_dims, combined_ids = _infer_concat_order_from_nested_list(datasets, concat_dims) + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_nested_list\ + (datasets, concat_dims) # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fb9c43c0165..0edc7db4980 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2234,7 +2234,7 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2261,7 +2261,7 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dim=dim) as actual: + with open_mfdataset([tmp], concat_dims=[dim]) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index f171f998a7e..8b6912e98d5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,8 +10,9 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _concat_nd) + _new_tile_id, _concat_along_first_dim, + _infer_concat_order_from_nested_list, _infer_tile_ids_from_nested_list, + _check_shape_tile_ids, _combine_nd) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -308,6 +309,9 @@ def test_auto_combine(self): expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + actual = auto_combine(actual) + assert_identical(expected, actual) + actual = auto_combine([actual]) assert_identical(expected, actual) @@ -354,7 +358,7 @@ def test_auto_combine_previously_failed(self): expected = Dataset({'a': (('t', 'x'), [[np.nan, 2, 3], [1, 2, np.nan]])}, {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dim='t') + actual = auto_combine(datasets, concat_dims=['t']) assert_identical(expected, actual) @requires_dask # only for toolz @@ -379,14 +383,14 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) data = Dataset({'x': 0}) - actual = auto_combine([data, data, data], concat_dim=None) + actual = auto_combine([data, data, data], concat_dims=None) assert_identical(data, actual) # Single object, with a concat_dim explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({'x': 0, 'y': 1})] dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) + actual = auto_combine(objs, concat_dims=[dim]) expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, {'baz': [100]}) assert_identical(expected, actual) @@ -395,7 +399,7 @@ def test_auto_combine_no_concat(self): # expected for non-scalar values, too. 
objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) + actual = auto_combine(objs, concat_dims=[dim]) expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), 'y': (('baz', 'z'), [[1, 2]])}, {'baz': [100]}) @@ -435,20 +439,41 @@ def test_3d(self): actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) - def test_redundant_nesting_gotcha(self): + def test_single_dataset(self): ds = create_test_data(0) - input = [[ds], [ds]] + input = [ds] - with pytest.raises(TypeError): - _infer_tile_ids_from_nested_list(input, [], {}) + expected = {(0,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) - def test_bad_element(self): + def test_redundant_nesting(self): + ds = create_test_data + input = [[ds(0)], [ds(1)]] + + expected = {(0, 0): ds(0), (1, 0): ds(1)} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + @pytest.mark.parametrize("bad_element", ['a', 2]) + def test_bad_element(self, bad_element): ds = create_test_data(0) - input = [ds, 'bad_element'] - with pytest.raises(TypeError): + input = [ds, bad_element] + with raises_regex(TypeError, 'Element at position .* is of type .*, ' + 'which is neither a list nor an ' + 'xarray.Dataset'): _infer_tile_ids_from_nested_list(input, [], {}) + def test_ignore_empty_list(self): + ds = create_test_data(0) + input = [ds, []] + expected = {(0,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + def test_ragged_input(self): + # Auto_combine won't work on ragged input + # but this is just to increase test coverage ds = create_test_data input = [ds(0), [ds(1), ds(2)]] @@ -456,6 +481,15 @@ def test_ragged_input(self): actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) + def test_infer_from_datasets(self): + ds = create_test_data + input = [ds(0), ds(1)] + + expected = {(0,): ds(0), (1,): ds(1)} + actual, concat_dims = _infer_concat_order_from_nested_list\ + (input, ['dim1']) + assert_combined_tile_ids_equal(expected, actual) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -473,10 +507,10 @@ def _create_tile_ids(shape): return list(tile_ids) -class TestConcatND(object): - def test_get_tile_ids(self, create_combined_ids): +class TestCombineND(object): + def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) - combined_ids = _create_combined_ids(shape) + combined_ids = create_combined_ids(shape) for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] @@ -485,17 +519,17 @@ def test_get_tile_ids(self, create_combined_ids): @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) - combined_ids = _create_combined_ids(shape) + combined_ids = create_combined_ids(shape) ds = create_test_data - result = _concat_all_along_first_dim(combined_ids, dim=concat_dim) + result = _concat_along_first_dim(combined_ids, dim=concat_dim) expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) def test_concat_twice(self, create_combined_ids): shape = (2, 3) - combined_ids = _create_combined_ids(shape) - result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) + combined_ids = 
create_combined_ids(shape) + result = _combine_nd(combined_ids, concat_dims=['dim1', 'dim2']) ds = create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') @@ -513,19 +547,28 @@ def test_check_lengths(self): with pytest.raises(AssertionError): _check_shape_tile_ids(combined_tile_ids) - def test_check_non_zero_length_along_all_dims(self): - ds = create_test_data(0) - combined_tile_ids = {(0, 0): ds, (1, 0): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - - def test_check_linearity(self): - ds = create_test_data(0) - combined_tile_ids = {(0,): ds, (2,): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - def test_check_contains_datasets(self): combined_tile_ids = {(0,): 'a', (1,): 'b'} with pytest.raises(AssertionError): _check_shape_tile_ids(combined_tile_ids) + + +class TestAutoCombineND(object): + # TODO there should be a lot more tests in here testing different cases + + def test_auto_combine_2d(self): + ds = create_test_data + + partway1 = concat([ds(0), ds(3)], dim='dim1') + partway2 = concat([ds(1), ds(4)], dim='dim1') + partway3 = concat([ds(2), ds(5)], dim='dim1') + expected = concat([partway1, partway2, partway3], dim='dim2') + + datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] + result = auto_combine(datasets, concat_dims=['dim1', 'dim2']) + + assert_equal(result, expected) + + def test_ragged_input(self): + # TODO should throw an informative error if you try this + ... From f4e9aad81f154d5edbc0fc422678d6fa9025c0cf Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 10 Nov 2018 12:53:12 +0000 Subject: [PATCH 09/96] Fixed a failing test which I didn't notice because I don't have pseudoNetCDF --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0edc7db4980..d9109c4b67f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2625,7 +2625,7 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dim='TSTEP', + concat_dims=['TSTEP'], backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5) From 00004a14304d461c8784fca75a4c848cccbc7011 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 12:44:00 +0000 Subject: [PATCH 10/96] Began updating open_mfdataset to handle N-D input --- xarray/backends/api.py | 44 ++++++++++++++++++++++++++++++++++-------- xarray/core/combine.py | 11 ++++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d3c5efb6ec2..89aa989624b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,8 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import auto_combine +from ..core.combine import (_infer_concat_order_from_positions, _combine_nd,\ + _check_shape_tile_ids, merge) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -483,6 +484,7 @@ def close(self): def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', + infer_order_from_coords=False, autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. 
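
As a rough sketch of the calling convention this signature change works towards (the file names below are placeholders, not files from this PR; `concat_dims` and `infer_order_from_coords` are simply the keyword names added in the hunk above, with one concatenation dimension per level of nesting, outermost first, as exercised by test_auto_combine_2d earlier in the series):

    # Hypothetical usage of the N-D open_mfdataset being built here.
    # Assumes four existing netCDF files tiled 2x2 in time ('t') and space ('x').
    import xarray as xr

    paths = [['t0_x0.nc', 't0_x1.nc'],
             ['t1_x0.nc', 't1_x1.nc']]
    ds = xr.open_mfdataset(paths, concat_dims=['t', 'x'],
                           infer_order_from_coords=False)
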
@@ -502,7 +504,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. - concat_dim : None, str, DataArray or Index, optional + concat_dims : None, str, DataArray or Index, optional Dimension to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only need to provide this argument if the dimension along which you want to @@ -561,6 +563,12 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition the 'minimal' coordinates. + infer_order_from_coords : bool, optional + If true attempt to deduce the order in which the datasets should be + concatenated from their coordinates. To do this the coordinates should + be monotonic along the dimension to be concatenated. + If false instead read the order from the structure the datasets are + supplied in. This structure should be a nested list of lists. parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. @@ -594,6 +602,11 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') + # If infer_order_from_coords=True then this is uneccessary, but quick as + # it will just loop over one list + combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? + ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? 
+ open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) @@ -620,13 +633,28 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # close datasets in case of a ValueError try: - if concat_dims is _CONCAT_DIM_DEFAULT: - combined = auto_combine(datasets, compat=compat, - data_vars=data_vars, coords=coords) + # TODO refactor this section to avoid duplicating any logic with auto_combine + if concat_dims is not None: + # Arrange datasets for concatenation + if infer_order_from_coords: + # Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + raise NotImplementedError + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + else: + # Already sorted so just use the ids already determined from the input shape + combined_ids = dict(zip(ids, datasets)) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) else: - combined = auto_combine(datasets, concat_dims=concat_dims, - compat=compat, - data_vars=data_vars, coords=coords) + # Case of no concatenation wanted + concatenated = datasets + combined = merge(concatenated, compat=compat) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 67b514c6e3c..a05d0ac31ee 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -370,7 +370,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def _infer_concat_order_from_nested_list(datasets, concat_dims): +def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) @@ -585,24 +585,25 @@ def auto_combine(datasets, # TODO this could be where we would optionally check alignment, as in #2039? - # Organise datasets in concatentation order in N-D + # Arrange datasets for concatenation if infer_order_from_coords: # TODO Use coordinates to determine tile_ID for each dataset in N-D # i.e. (shoyer's (1) from discussion in #2159) raise NotImplementedError # Once this is implemented I think it should be the default + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) else: # Determine tile_IDs by structure of input in N-D # (i.e. 
ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_nested_list\ - (datasets, concat_dims) + combined_ids, concat_dims = _infer_concat_order_from_positions \ + (datasets, concat_dims) # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) + data_vars=data_vars, coords=coords) else: # Case of no concatenation wanted concatenated = datasets From b41e37494dea8224c544b04533222c9bfdc7f6a6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 13:58:45 +0000 Subject: [PATCH 11/96] Refactored to remove duplicate logic in open_mfdataset & auto_combine --- xarray/backends/api.py | 42 +++++++++--------------- xarray/core/combine.py | 73 +++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 56 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 89aa989624b..672e43e9e58 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,8 +10,7 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import (_infer_concat_order_from_positions, _combine_nd,\ - _check_shape_tile_ids, merge) +from ..core.combine import _infer_concat_order_from_positions, _auto_combine from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -602,8 +601,10 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is uneccessary, but quick as - # it will just loop over one list + # If infer_order_from_coords=True then this is uneccessary, but that's fine + # as it should be quick - in this case it will just loop over one list + # If infer_order_from_coords=False then this creates a flat list which is + # easier to iterate over, while saving the originally-supplied structure combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? 
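The point of keeping the flat ids list alongside the flattened paths is sketched below (a simplified illustration, assuming each path is opened in the same order it appears in paths): once the files have been opened, the tile IDs can be re-attached to the resulting datasets with a plain zip, which is what the refactored combine step relies on.

    # after every path has been opened, re-pair the datasets with their tile IDs
    datasets = [open_dataset(p, **open_kwargs) for p in paths]
    combined_ids = dict(zip(ids, datasets))  # tile ID -> opened Dataset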
@@ -631,30 +632,17 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # the underlying datasets will still be stored as dask arrays datasets, file_objs = dask.compute(datasets, file_objs) - # close datasets in case of a ValueError + # Close datasets in case of a ValueError try: - # TODO refactor this section to avoid duplicating any logic with auto_combine - if concat_dims is not None: - # Arrange datasets for concatenation - if infer_order_from_coords: - # Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - raise NotImplementedError - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) - else: - # Already sorted so just use the ids already determined from the input shape - combined_ids = dict(zip(ids, datasets)) - - # Check that the combined_ids are sensible - _check_shape_tile_ids(combined_ids) - - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted - concatenated = datasets - combined = merge(concatenated, compat=compat) + if infer_order_from_coords: + # Discard ordering because it should be redone from coordinates + ids = False + + combined = _auto_combine(datasets, concat_dims=concat_dims, + compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=infer_order_from_coords, + ids=ids) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a05d0ac31ee..caa6d805232 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -512,7 +512,6 @@ def _concat_along_first_dim(combined_IDs, dim, data_vars='all', grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) new_combined_IDs = {} - # TODO Would there be any point in parallelizing this concatenation step? for new_ID, group in grouped.items(): to_concat = [ds for old_ID, ds in group] new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, @@ -521,6 +520,45 @@ def _concat_along_first_dim(combined_IDs, dim, data_vars='all', return new_combined_IDs +def _auto_combine(datasets, concat_dims, compat, data_vars, coords, + infer_order_from_coords, ids): + """ + This function decides if any concatenation is necessary, and if so it calls + the logic to decide their concatenation order before concatenating. + """ + + if concat_dims is not None: + # Arrange datasets for concatenation + if infer_order_from_coords: + # Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + raise NotImplementedError + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + else: + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_positions\ + (datasets, concat_dims) + else: + # Already sorted so just use the ids already passed + combined_ids = dict(zip(ids, datasets)) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) + else: + # Case of no concatenation wanted + concatenated = datasets + combined = merge(concatenated, compat=compat) + + return combined + + def auto_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', @@ -581,31 +619,8 @@ def auto_combine(datasets, # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? - if concat_dims is not None: - - # TODO this could be where we would optionally check alignment, as in #2039? - - # Arrange datasets for concatenation - if infer_order_from_coords: - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # i.e. (shoyer's (1) from discussion in #2159) - raise NotImplementedError - # Once this is implemented I think it should be the default - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) - else: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions \ - (datasets, concat_dims) - - # Check that the combined_ids are sensible - _check_shape_tile_ids(combined_ids) - - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted - concatenated = datasets - combined = merge(concatenated, compat=compat) - return combined + # The IDs argument tells _auto_combine that the datasets are not sorted + return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=infer_order_from_coords, + ids=False) \ No newline at end of file From 8672a79833236a893aba85771c96b02db06008ef Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 16:32:28 +0000 Subject: [PATCH 12/96] Implemented Shoyers suggestion in #2553 to rewrite the recursive nested list traverser as an iterator --- xarray/core/combine.py | 46 +++++++++++++----------------------- xarray/tests/test_combine.py | 27 +++++++-------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index caa6d805232..66ece818470 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -372,7 +372,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_positions(datasets, concat_dims): - combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) + combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call # TODO might be faster in this case to just work out the concat_dims once here @@ -389,48 +389,34 @@ def _infer_concat_order_from_positions(datasets, concat_dims): return combined_ids, concat_dims -def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): +def _infer_tile_ids_from_nested_list(entry, current_pos): """ - Given a list of lists (of lists...) 
of datasets, returns a dictionary - with the index of each dataset in the nested list structure as the key. + Given a list of lists (of lists...) of objects, returns a iterator + which returns a tuple containing the index of each object in the nested + list structure as the key, and the object. This can then be called by the + dict constructor to create a dictionary of the objects organised byt their + position in the original nested list. Recursively traverses the given structure, while keeping track of the - current position. + current position. Should work for any type of object which isn't a list. Parameters ---------- - entry : list[list[xarray.Dataset, xarray.Dataset, ...]] - List of lists of arbitrary depth, containing datasets in the order they - are to be concatenated. + entry : list[list[obj, obj, ...]] + List of lists of arbitrary depth, containing objects in the order + they are to be concatenated. Returns ------- - combined_tile_ids : dict[tuple(int, ...), xarray.Dataset] + combined_tile_ids : dict[tuple(int, ...), obj] """ - from .dataset import Dataset - if isinstance(entry, list): - # Dive down tree and recursively open the next list - current_pos.append(0) for i, item in enumerate(entry): - current_pos[-1] = i - combined_tile_ids = _infer_tile_ids_from_nested_list\ - (item, current_pos, combined_tile_ids) - - # Move back up tree - del current_pos[-1] - return combined_tile_ids - - elif isinstance(entry, Dataset): - # Termination condition - combined_tile_ids[tuple(current_pos)] = entry - return combined_tile_ids - + for result in _infer_tile_ids_from_nested_list(item, current_pos + (i,)): + yield result else: - raise TypeError("Element at position " + str(tuple(current_pos)) + - " is of type " + str(type(entry)) + ", which is " - "neither a list nor an xarray.Dataset") + yield current_pos, entry def _check_shape_tile_ids(combined_tile_ids): @@ -619,7 +605,7 @@ def auto_combine(datasets, # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? - # The IDs argument tells _auto_combine that the datasets are not sorted + # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, infer_order_from_coords=infer_order_from_coords, diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 8b6912e98d5..0efeaeec30b 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -11,7 +11,7 @@ from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( _new_tile_id, _concat_along_first_dim, - _infer_concat_order_from_nested_list, _infer_tile_ids_from_nested_list, + _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _combine_nd) from . 
import ( @@ -412,7 +412,7 @@ def test_1d(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): @@ -422,7 +422,7 @@ def test_2d(self): expected = {(0, 0): ds(0), (0, 1): ds(1), (1, 0): ds(2), (1, 1): ds(3), (2, 0): ds(4), (2, 1): ds(5)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): @@ -436,7 +436,7 @@ def test_3d(self): (1, 0, 0): ds(6), (1, 0, 1): ds(7), (1, 1, 0): ds(8), (1, 1, 1): ds(9), (1, 2, 0): ds(10), (1, 2, 1): ds(11)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_single_dataset(self): @@ -444,7 +444,7 @@ def test_single_dataset(self): input = [ds] expected = {(0,): ds} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_redundant_nesting(self): @@ -452,23 +452,14 @@ def test_redundant_nesting(self): input = [[ds(0)], [ds(1)]] expected = {(0, 0): ds(0), (1, 0): ds(1)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) - @pytest.mark.parametrize("bad_element", ['a', 2]) - def test_bad_element(self, bad_element): - ds = create_test_data(0) - input = [ds, bad_element] - with raises_regex(TypeError, 'Element at position .* is of type .*, ' - 'which is neither a list nor an ' - 'xarray.Dataset'): - _infer_tile_ids_from_nested_list(input, [], {}) - def test_ignore_empty_list(self): ds = create_test_data(0) input = [ds, []] expected = {(0,): ds} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_ragged_input(self): @@ -478,7 +469,7 @@ def test_ragged_input(self): input = [ds(0), [ds(1), ds(2)]] expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -486,7 +477,7 @@ def test_infer_from_datasets(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual, concat_dims = _infer_concat_order_from_nested_list\ + actual, concat_dims = _infer_concat_order_from_positions\ (input, ['dim1']) assert_combined_tile_ids_equal(expected, actual) From 4f56b240216849044d8d68c17e0ce88256393017 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 16:33:08 +0000 Subject: [PATCH 13/96] --amend --- xarray/core/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 66ece818470..d724a8dfdce 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -413,7 +413,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): if isinstance(entry, list): for i, item in enumerate(entry): - for result in _infer_tile_ids_from_nested_list(item, current_pos + (i,)): + for result in _infer_tile_ids_from_nested_list(item, + current_pos + (i,)): yield result else: yield 
current_pos, entry From 4cfaf2e1fb02dbb390cf64bbdfb37fca562afab9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 19:38:26 +0000 Subject: [PATCH 14/96] Now raises ValueError if input not ordered correctly before concatenation --- xarray/core/combine.py | 59 +++++++++++++++++++++++++----------- xarray/tests/test_combine.py | 8 ++--- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d724a8dfdce..c0a264f6335 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -374,8 +374,9 @@ def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) - # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call - # TODO might be faster in this case to just work out the concat_dims once here + # Currently if concat_dims is not supplied then _auto_concat attempts to + # deduce it on every call + # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: @@ -420,20 +421,39 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry -def _check_shape_tile_ids(combined_tile_ids): - # TODO create custom exception class instead of using asserts? - # Is this function even necessary? - +def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() - # Check all tuples are the same length - lengths = [len(id) for id in tile_ids] - assert set(lengths) == {lengths[0]} + # TODO cover all of these with separate unit tests - # Check only datasets are contained - from .dataset import Dataset - for v in combined_tile_ids.values(): - assert isinstance(v, Dataset) + # Check all tuples are the same length + # i.e. check that all lists are nested to the same depth + nesting_depths = [len(id) for id in tile_ids] + if not set(nesting_depths) == {nesting_depths[0]}: + raise ValueError("The supplied objects do not form a hypercube because" + " sub-lists do not have consistent depths") + + # Check objects form a hypercube + # i.e. check all lists along one dimension are same length, monotonically- + # increasing with no repetitions + for dim in range(nesting_depths[0]): + try: + indices_along_dim = [id[dim] for id in tile_ids] + except IndexError: + raise ValueError("The supplied objects do not form a hypercube " + "because sub-lists do not have consistent " + "lengths along dimension {}".format(str(dim))) + + # TODO work out if this actually means something is wrong + if not set(indices_along_dim) == indices_along_dim: + raise ValueError("The supplied objects do not form a hypercube " + "because there are repeated concatenation " + "positions along concatenation dimension " + "{}".format(str(dim))) + + if not sorted(indices_along_dim) == indices_along_dim: + raise ValueError("The supplied objects have not been successfully " + "ordered along dimension {}".format(str(dim))) def _data_vars(combined_id): @@ -473,6 +493,7 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', # Convert list of tuples back into a dictionary concatenated_ids = dict(tiled_datasets_group) + # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. 
It always removes the first @@ -517,10 +538,12 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, if concat_dims is not None: # Arrange datasets for concatenation if infer_order_from_coords: - # Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously raise NotImplementedError - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + # Shoould look like + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, + # concat_dims) else: # Use information from the shape of the user input if not ids: @@ -604,10 +627,10 @@ def auto_combine(datasets, Dataset.merge """ - # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? + # TODO do some of _calc_concat_dim_coord's checks on concat_dims here? # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, infer_order_from_coords=infer_order_from_coords, - ids=False) \ No newline at end of file + ids=False) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0efeaeec30b..6aff75aefd4 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -532,15 +532,11 @@ def test_concat_twice(self, create_combined_ids): class TestCheckShapeTileIDs(object): + # TODO test all types of ValueErrors from _check_shape_tile_id def test_check_lengths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - - def test_check_contains_datasets(self): - combined_tile_ids = {(0,): 'a', (1,): 'b'} - with pytest.raises(AssertionError): + with pytest.raises(ValueError): _check_shape_tile_ids(combined_tile_ids) From 9fd1413692cec1cfd46c3898c413b0c16c822b8a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 22 Nov 2018 10:20:45 +0000 Subject: [PATCH 15/96] Added some more prototype tests defining desired behaviour more clearly --- xarray/tests/test_combine.py | 43 +++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 6aff75aefd4..e129ccb041f 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -462,7 +462,7 @@ def test_ignore_empty_list(self): actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) - def test_ragged_input(self): + def test_uneven_depth_input(self): # Auto_combine won't work on ragged input # but this is just to increase test coverage ds = create_test_data @@ -472,6 +472,17 @@ def test_ragged_input(self): actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) + def test_uneven_length_input(self): + # Auto_combine won't work on ragged input + # but this is just to increase test coverage + ds = create_test_data + input = [[ds(0)], [ds(1), ds(2)]] + + expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} + actual = dict(_infer_tile_ids_from_nested_list(input, ())) + print(actual) + assert_combined_tile_ids_equal(expected, actual) + def test_infer_from_datasets(self): ds = create_test_data input = [ds(0), ds(1)] @@ -533,10 +544,20 @@ def test_concat_twice(self, create_combined_ids): class 
TestCheckShapeTileIDs(object): # TODO test all types of ValueErrors from _check_shape_tile_id - def test_check_lengths(self): + def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with pytest.raises(ValueError): + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent depths'): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_lengths(self): + ds = create_test_data(0) + combined_tile_ids = {(0, 0): ds, (0, 1): ds, + (1, 0): ds, (1, 1): ds, + (0, 1): ds} + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent lengths'): _check_shape_tile_ids(combined_tile_ids) @@ -556,6 +577,22 @@ def test_auto_combine_2d(self): assert_equal(result, expected) + @pytest.mark.skip def test_ragged_input(self): # TODO should throw an informative error if you try this ... + + def test_combine_redundant_nesting(self): + objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] + actual = auto_combine(objs, concat_dims=[None, 'x']) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]] + actual = auto_combine(objs, concat_dims=['x', None]) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + @pytest.mark.skip + def test_mixed_default_concat_dims(self): + ... From 8ad01211609005e455e8f32149578c133ba06f74 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 24 Nov 2018 13:27:12 +0000 Subject: [PATCH 16/96] Now raises informative errors on invalid forms of input --- xarray/core/combine.py | 22 +++++----------------- xarray/tests/test_combine.py | 30 +++++++++++++++++------------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c0a264f6335..c829d570124 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,6 +2,7 @@ import warnings import toolz.itertoolz as itertoolz +from collections import Counter import pandas as pd @@ -433,28 +434,15 @@ def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") - # Check objects form a hypercube - # i.e. 
check all lists along one dimension are same length, monotonically- - # increasing with no repetitions + # Check all lists along one dimension are same length for dim in range(nesting_depths[0]): - try: - indices_along_dim = [id[dim] for id in tile_ids] - except IndexError: + indices_along_dim = [id[dim] for id in tile_ids] + occurrences = Counter(indices_along_dim) + if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " "because sub-lists do not have consistent " "lengths along dimension {}".format(str(dim))) - # TODO work out if this actually means something is wrong - if not set(indices_along_dim) == indices_along_dim: - raise ValueError("The supplied objects do not form a hypercube " - "because there are repeated concatenation " - "positions along concatenation dimension " - "{}".format(str(dim))) - - if not sorted(indices_along_dim) == indices_along_dim: - raise ValueError("The supplied objects have not been successfully " - "ordered along dimension {}".format(str(dim))) - def _data_vars(combined_id): id, ds = combined_id diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index e129ccb041f..3fa9650c2ca 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -406,6 +406,8 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +# TODO should we use @requires_dask? only for toolz? + class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data @@ -543,7 +545,6 @@ def test_concat_twice(self, create_combined_ids): class TestCheckShapeTileIDs(object): - # TODO test all types of ValueErrors from _check_shape_tile_id def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} @@ -553,9 +554,8 @@ def test_check_depths(self): def test_check_lengths(self): ds = create_test_data(0) - combined_tile_ids = {(0, 0): ds, (0, 1): ds, - (1, 0): ds, (1, 1): ds, - (0, 1): ds} + combined_tile_ids = {(0, 0): ds, (0, 1): ds , (0, 2): ds, + (1, 0): ds, (1, 1): ds} with raises_regex(ValueError, 'sub-lists do not have ' 'consistent lengths'): _check_shape_tile_ids(combined_tile_ids) @@ -577,12 +577,20 @@ def test_auto_combine_2d(self): assert_equal(result, expected) - @pytest.mark.skip - def test_ragged_input(self): - # TODO should throw an informative error if you try this - ... + def test_invalid_hypercube_input(self): + ds = create_test_data + + datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent lengths'): + auto_combine(datasets, concat_dims=['dim1', 'dim2']) - def test_combine_redundant_nesting(self): + datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent depths'): + auto_combine(datasets, concat_dims=['dim1', 'dim2']) + + def test_combine_concat_one_dim_merge_another(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) expected = Dataset({'x': [0, 1]}) @@ -592,7 +600,3 @@ def test_combine_redundant_nesting(self): actual = auto_combine(objs, concat_dims=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - - @pytest.mark.skip - def test_mixed_default_concat_dims(self): - ... 
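To make the hypercube condition behind these new error messages concrete, the rule that _check_shape_tile_ids enforces can be sketched roughly as follows (a standalone illustration that returns a bool rather than raising, with example tile IDs invented for the purpose):

    from collections import Counter

    def forms_hypercube(tile_ids):
        # all tile IDs must be nested to the same depth...
        depths = {len(tile_id) for tile_id in tile_ids}
        if len(depths) != 1:
            return False
        # ...and along each dimension every index must occur equally often
        for dim in range(depths.pop()):
            counts = Counter(tile_id[dim] for tile_id in tile_ids)
            if len(set(counts.values())) != 1:
                return False
        return True

    forms_hypercube([(0, 0), (0, 1), (1, 0), (1, 1)])           # full 2x2 grid -> True
    forms_hypercube([(0,), (0, 1)])                              # inconsistent depths -> False
    forms_hypercube([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)])    # ragged lengths -> False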
From 4b2c5443a637ce5ca1fe1f756921d0898b91cda4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 25 Nov 2018 16:59:42 +0000 Subject: [PATCH 17/96] Refactoring to alos merge along each dimension --- xarray/backends/api.py | 8 ++- xarray/core/combine.py | 124 ++++++++++++++++++++++++----------- xarray/tests/test_combine.py | 19 +++++- 3 files changed, 107 insertions(+), 44 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 672e43e9e58..cc5aad2bfaf 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -601,12 +601,14 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is uneccessary, but that's fine + # If infer_order_from_coords=True then this is unnecessary, but that's fine # as it should be quick - in this case it will just loop over one list # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure - combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? - ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? + combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ + (paths, concat_dims) + ids, paths = list(combined_ids_paths.keys()), \ + list(combined_ids_paths.values()) open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c829d570124..e0e18c8149d 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,8 +380,11 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: + if concat_dims == _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims + elif concat_dims is None: + # TODO should this be a list of n Nones? + pass else: if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) @@ -425,8 +428,6 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() - # TODO cover all of these with separate unit tests - # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] @@ -449,7 +450,7 @@ def _data_vars(combined_id): return tuple(sorted(ds.data_vars)) -def _combine_nd(combined_IDs, concat_dims, data_vars='all', +def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts'): """ Concatenates and merges an N-dimensional structure of datasets. @@ -459,7 +460,7 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', Parameters ---------- - combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] + combined_ids : Dict[Tuple[int, ...]], xarray.Dataset] Structure containing all datasets to be concatenated with "tile_IDs" as keys, which specify position within the desired final combined result. 
concat_dims : sequence of str @@ -471,49 +472,94 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', """ + # TODO merge, don't concat if concat_dim is None + + # TODO refactor this logic, possibly using method in np.blocks + # Perform N-D dimensional concatenation + # Each iteration of this loop reduces the length of the tile_IDs tuples + # by one. It always removes the first + for concat_dim in concat_dims: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + + combined_ids = _auto_combine_along_first_dim(combined_ids, + dim=dim, + data_vars=data_vars, + coords=coords, + compat=compat) + combined_ds = next(iter(combined_ids.values())) + #print(combined_ds) + return combined_ds + + +def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, + coords, compat): # Organise by data variables grouped_by_data_vars = itertoolz.groupby(_data_vars, - combined_IDs.items()).values() + combined_ids.items()).values() - concatenated_datasets = [] + new_combined_ids_all_vars = [] for tiled_datasets_group in grouped_by_data_vars: - # Convert list of tuples back into a dictionary - concatenated_ids = dict(tiled_datasets_group) + # Groupby returns list of tuples - convert back into a dictionary + combined_ids = dict(tiled_datasets_group) - # TODO refactor this logic, possibly using method in np.blocks - # Perform N-D dimensional concatenation - # Each iteration of this loop reduces the length of the tile_IDs tuples - # by one. It always removes the first - for concat_dim in concat_dims: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + # Group into lines of datasets which must be combined along dim + grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) - concatenated_ids = _concat_along_first_dim(concatenated_ids, - dim=dim, - data_vars=data_vars, - coords=coords) - concatenated_datasets = concatenated_datasets \ - + list(concatenated_ids.values()) - return merge(concatenated_datasets, compat=compat) + new_combined_ids = {} + for new_id, group in grouped.items(): + new_combined_ids[new_id] = _auto_combine_1d(group, dim, data_vars, + coords, compat) + print("New combined IDs:") + print(new_combined_ids) + new_combined_ids_all_vars.append(new_combined_ids) -def _new_tile_id(single_id_ds_pair): - # TODO maybe replace with something like lambda x: x[0][1:]? 
- tile_id, ds = single_id_ds_pair - return tile_id[1:] + # merge the new_combined_ids dicts by using xarray.merge on elements with the same key + return _merge_by_new_ids(new_combined_ids_all_vars, compat=compat) + + +def _auto_combine_1d(to_combine, dim, data_vars, coords, compat): + if dim is not None: + combined = _auto_concat(to_combine, dim=dim, data_vars=data_vars, + coords=coords) + else: + print(to_combine) + combined = merge(to_combine, compat=compat) + + return combined + + +def _merge_by_new_ids(new_combined_ids_all_vars, compat): + # Merge different variables back together + # Merging a list of dicts + # Group by indexes + grouped_by_new_id = itertoolz.groupby(_tile_id, new_combined_ids_all_vars) + + print("Grouped by new ID:") + print(grouped_by_new_id) + + # Merge different variables back together, while retaining new tile IDs + merged_new_combined_ids = {} + for tile_id, group in grouped_by_new_id.items(): + + print(group) + to_merge = [list(combined_ds.values())[0] for combined_ds in group] + print(to_merge) + merged = merge(to_merge, compat=compat) + merged_new_combined_ids[tile_id] = merged + return merged_new_combined_ids -def _concat_along_first_dim(combined_IDs, dim, data_vars='all', - coords='different'): - grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) - new_combined_IDs = {} - for new_ID, group in grouped.items(): - to_concat = [ds for old_ID, ds in group] - new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, - data_vars=data_vars, - coords=coords) - return new_combined_IDs +def _tile_id(new_combined_ids): + return next(iter(new_combined_ids.keys())) + + +def _new_tile_id(single_id_ds_pair): + # TODO maybe replace with something like lambda x: x[0][1:]? + tile_id, ds = single_id_ds_pair + return tile_id[1:] def _auto_combine(datasets, concat_dims, compat, data_vars, coords, @@ -529,7 +575,7 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, raise NotImplementedError # TODO Use coordinates to determine tile_ID for each dataset in N-D # Ignore how they were ordered previously - # Shoould look like + # Should look like # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, # concat_dims) else: @@ -546,11 +592,13 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) + print(concat_dims) + # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords) else: - # Case of no concatenation wanted + # Case of no concatenation wanted at all concatenated = datasets combined = merge(concatenated, compat=compat) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 3fa9650c2ca..dbd2c070251 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,7 +10,7 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _concat_along_first_dim, + _new_tile_id, _auto_combine_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _combine_nd) @@ -386,6 +386,13 @@ def test_auto_combine_no_concat(self): actual = auto_combine([data, data, data], concat_dims=None) assert_identical(data, actual) + tmp1 = Dataset({'x': 0}) + tmp2 = Dataset({'x': np.nan}) + actual = auto_combine([tmp1, tmp2], concat_dims=None) + 
assert_identical(tmp1, actual) + actual = auto_combine([tmp1, tmp2], concat_dims=[None]) + assert_identical(tmp1, actual) + # Single object, with a concat_dim explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({'x': 0, 'y': 1})] @@ -520,12 +527,18 @@ def test_get_new_tile_ids(self, create_combined_ids): expected_new_tile_id = tile_id[1:] assert _new_tile_id(combined) == expected_new_tile_id + def test_merge_by_new_ids(self): + ... + @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) combined_ids = create_combined_ids(shape) ds = create_test_data - result = _concat_along_first_dim(combined_ids, dim=concat_dim) + result = _auto_combine_along_first_dim(combined_ids, dim=concat_dim, + data_vars='all', + coords='different', + compat='no_conflicts') expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) @@ -590,7 +603,7 @@ def test_invalid_hypercube_input(self): 'consistent depths'): auto_combine(datasets, concat_dims=['dim1', 'dim2']) - def test_combine_concat_one_dim_merge_another(self): + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) expected = Dataset({'x': [0, 1]}) From 3d0061e382dc51badc1f731eb53df8f42df9a7c1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 25 Nov 2018 17:57:05 +0000 Subject: [PATCH 18/96] Refactored to literally just apply the old auto_combine along each dimension --- xarray/core/combine.py | 116 +++++++++++------------------------ xarray/tests/test_combine.py | 51 ++++++++------- 2 files changed, 64 insertions(+), 103 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index e0e18c8149d..b476e77e850 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,11 +380,8 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT: - concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims - elif concat_dims is None: - # TODO should this be a list of n Nones? - pass + if concat_dims == _CONCAT_DIM_DEFAULT or None: + concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) @@ -428,6 +425,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() + # TODO a check that only the expected types of objects are contained + # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] @@ -472,93 +471,54 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', """ - # TODO merge, don't concat if concat_dim is None - # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. 
It always removes the first for concat_dim in concat_dims: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - - combined_ids = _auto_combine_along_first_dim(combined_ids, - dim=dim, - data_vars=data_vars, - coords=coords, - compat=compat) + combined_ids = _auto_combine_all_along_first_dim(combined_ids, + dim=concat_dim, + data_vars=data_vars, + coords=coords, + compat=compat) combined_ds = next(iter(combined_ids.values())) - #print(combined_ds) return combined_ds def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat): - # Organise by data variables - grouped_by_data_vars = itertoolz.groupby(_data_vars, - combined_ids.items()).values() - - new_combined_ids_all_vars = [] - for tiled_datasets_group in grouped_by_data_vars: - - # Groupby returns list of tuples - convert back into a dictionary - combined_ids = dict(tiled_datasets_group) + # Group into lines of datasets which must be combined along dim + grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) - # Group into lines of datasets which must be combined along dim - grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) + new_combined_ids = {} + for new_id, group in grouped.items(): + combined_ids = dict(group) + datasets = list(combined_ids.values()) + new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, + data_vars, coords) - new_combined_ids = {} - for new_id, group in grouped.items(): - new_combined_ids[new_id] = _auto_combine_1d(group, dim, data_vars, - coords, compat) + return new_combined_ids - print("New combined IDs:") - print(new_combined_ids) - new_combined_ids_all_vars.append(new_combined_ids) - # merge the new_combined_ids dicts by using xarray.merge on elements with the same key - return _merge_by_new_ids(new_combined_ids_all_vars, compat=compat) - - -def _auto_combine_1d(to_combine, dim, data_vars, coords, compat): - if dim is not None: - combined = _auto_concat(to_combine, dim=dim, data_vars=data_vars, - coords=coords) +def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', + data_vars='all', coords='different'): + # This is just the old auto_combine function (which only worked along 1D) + if concat_dim is not None: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), + datasets).values() + concatenated = [_auto_concat(ds, dim=dim, + data_vars=data_vars, coords=coords) + for ds in grouped] else: - print(to_combine) - combined = merge(to_combine, compat=compat) - - return combined - - -def _merge_by_new_ids(new_combined_ids_all_vars, compat): - # Merge different variables back together - # Merging a list of dicts - # Group by indexes - grouped_by_new_id = itertoolz.groupby(_tile_id, new_combined_ids_all_vars) - - print("Grouped by new ID:") - print(grouped_by_new_id) - - # Merge different variables back together, while retaining new tile IDs - merged_new_combined_ids = {} - for tile_id, group in grouped_by_new_id.items(): - - print(group) - to_merge = [list(combined_ds.values())[0] for combined_ds in group] - print(to_merge) - merged = merge(to_merge, compat=compat) - merged_new_combined_ids[tile_id] = merged - - return merged_new_combined_ids - - -def _tile_id(new_combined_ids): - return next(iter(new_combined_ids.keys())) + concatenated = datasets + merged = merge(concatenated, compat=compat) + return merged def _new_tile_id(single_id_ds_pair): - # TODO maybe replace with something like lambda x: x[0][1:]? 
- tile_id, ds = single_id_ds_pair + tile_id, ds = single_id_ds_pair return tile_id[1:] @@ -589,19 +549,15 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, # Already sorted so just use the ids already passed combined_ids = dict(zip(ids, datasets)) - # Check that the combined_ids are sensible + # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) - print(concat_dims) - # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords) else: # Case of no concatenation wanted at all - concatenated = datasets - combined = merge(concatenated, compat=compat) - + combined = merge(datasets, compat=compat) return combined @@ -663,8 +619,6 @@ def auto_combine(datasets, Dataset.merge """ - # TODO do some of _calc_concat_dim_coord's checks on concat_dims here? - # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index dbd2c070251..8abe0ac0f5d 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,9 +10,9 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _auto_combine_along_first_dim, + _new_tile_id, _auto_combine_all_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _check_shape_tile_ids, _combine_nd) + _check_shape_tile_ids, _combine_nd, _auto_combine_1d) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -302,42 +302,41 @@ def test_concat_lazy(self): class TestAutoCombine(object): + @pytest.mark.parametrize("combine", [_auto_combine_1d, auto_combine]) @requires_dask # only for toolz - def test_auto_combine(self): + def test_auto_combine(self, combine): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = auto_combine(objs) + actual = combine(objs) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - actual = auto_combine(actual) - assert_identical(expected, actual) - - actual = auto_combine([actual]) + actual = combine([actual]) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = auto_combine(objs) + actual = combine(objs) expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) + # TODO find out why this fails!! 
# ensure auto_combine handles non-sorted variables objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] - actual = auto_combine(objs) + actual = combine(objs) expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) assert_identical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] with raises_regex(ValueError, 'too many .* dimensions'): - auto_combine(objs) + combine(objs) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'cannot infer dimension'): - auto_combine(objs) + combine(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with pytest.raises(KeyError): - auto_combine(objs) + combine(objs) @requires_dask # only for toolz def test_auto_combine_previously_failed(self): @@ -413,8 +412,6 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) -# TODO should we use @requires_dask? only for toolz? - class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data @@ -518,6 +515,7 @@ def _create_tile_ids(shape): return list(tile_ids) +@requires_dask # only for toolz class TestCombineND(object): def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) @@ -527,18 +525,16 @@ def test_get_new_tile_ids(self, create_combined_ids): expected_new_tile_id = tile_id[1:] assert _new_tile_id(combined) == expected_new_tile_id - def test_merge_by_new_ids(self): - ... - @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) combined_ids = create_combined_ids(shape) ds = create_test_data - result = _auto_combine_along_first_dim(combined_ids, dim=concat_dim, - data_vars='all', - coords='different', - compat='no_conflicts') + result = _auto_combine_all_along_first_dim(combined_ids, + dim=concat_dim, + data_vars='all', + coords='different', + compat='no_conflicts') expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) @@ -574,9 +570,20 @@ def test_check_lengths(self): _check_shape_tile_ids(combined_tile_ids) +@requires_dask # only for toolz class TestAutoCombineND(object): # TODO there should be a lot more tests in here testing different cases + def test_single_dataset(self): + + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + actual = auto_combine(actual) + assert_identical(expected, actual) + def test_auto_combine_2d(self): ds = create_test_data From 60c93ba59fd6636a9f6ebe2bb460b9924d9ebec7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 26 Nov 2018 11:28:23 +0000 Subject: [PATCH 19/96] Added unit tests for open_mfdatset --- xarray/backends/api.py | 13 ++++--- xarray/core/combine.py | 8 ++--- xarray/tests/test_backends.py | 65 +++++++++++++++++++++++++++++++++++ xarray/tests/test_combine.py | 10 +++++- 4 files changed, 84 insertions(+), 12 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index cc5aad2bfaf..45ceb814a18 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -503,12 +503,12 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. 
- concat_dims : None, str, DataArray or Index, optional - Dimension to concatenate files along. This argument is passed on to + concat_dims : list of str, DataArray, Index or None, optional + Dimensions to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only - need to provide this argument if the dimension along which you want to - concatenate is not a dimension in the original datasets, e.g., if you - want to stack a collection of 2D arrays along a third dimension. + need to provide this argument if any of the dimensions along which you + want to concatenate is not a dimension in the original datasets, e.g., + if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. @@ -601,8 +601,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is unnecessary, but that's fine - # as it should be quick - in this case it will just loop over one list + # If infer_order_from_coords=True then this is unnecessary, but quick. # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ diff --git a/xarray/core/combine.py b/xarray/core/combine.py index b476e77e850..bf44997e9f6 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,7 +380,7 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or None: + if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: @@ -468,7 +468,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', Returns ------- - + combined_ds : xarray.Dataset """ # TODO refactor this logic, possibly using method in np.blocks @@ -481,7 +481,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', data_vars=data_vars, coords=coords, compat=compat) - combined_ds = next(iter(combined_ids.values())) + combined_ds = list(combined_ids.values())[0] return combined_ds @@ -576,7 +576,7 @@ def auto_combine(datasets, ---------- datasets : sequence of xarray.Dataset Dataset objects to merge. - concat_dims : list of str or DataArray or Index, optional + concat_dims : list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. 
You only need to provide this argument if any of the dimensions along which you want to concatenate are not a diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d9109c4b67f..42893cf5d9e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2135,12 +2135,42 @@ def test_open_mfdataset(self): assert actual.foo.variable.data.chunks == \ ((3, 2, 3, 2),) + with raises_regex(IOError, 'no files to open'): open_mfdataset('foo-bar-baz-*.nc') with raises_regex(ValueError, 'wild-card'): open_mfdataset('http://some/remote/uri') + def test_open_mfdataset_2d(self): + original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + with create_tmp_file() as tmp4: + original.isel(x=slice(5), + y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), + y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), + y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), + y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x']) as actual: + assert isinstance(actual.foo.variable.data, + da.Array) + assert actual.foo.variable.data.chunks == \ + ((5, 5), (4, 4)) + assert_identical(original, actual) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x'], + chunks={'x': 3, 'y': 2}) as actual: + assert actual.foo.variable.data.chunks == \ + ((3, 2, 3, 2), (2, 2, 2, 2),) + @requires_pathlib def test_open_mfdataset_pathlib(self): original = Dataset({'foo': ('x', np.random.randn(10))}) @@ -2153,6 +2183,41 @@ def test_open_mfdataset_pathlib(self): with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) + @requires_pathlib + def test_open_mfdataset_2d_pathlib(self): + original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + with create_tmp_file() as tmp4: + tmp1 = Path(tmp1) + tmp2 = Path(tmp2) + tmp3 = Path(tmp3) + tmp4 = Path(tmp4) + original.isel(x=slice(5), + y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), + y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), + y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), + y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x']) as actual: + assert_identical(original, actual) + + @pytest.mark.xfail(reason="Not yet implemented") + def test_open_mfdataset(self): + original = Dataset({'foo': ('x', np.random.randn(10))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + with open_mfdataset([tmp1, tmp2], + infer_order_from_coords=True) as actual: + assert_identical(original, actual) + def test_attrs_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 8abe0ac0f5d..0edc715698a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -318,7 +318,6 @@ def test_auto_combine(self, combine): expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) - # TODO find out why this fails!! 
# ensure auto_combine handles non-sorted variables objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] @@ -411,6 +410,15 @@ def test_auto_combine_no_concat(self): {'baz': [100]}) assert_identical(expected, actual) + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + data = create_test_data() + print(data) + objs = [data.isel(x=slice(4, 9)), data.isel(x=slice(4))] + actual = auto_combine(objs, infer_order_from_coords=True) + expected = data + assert_identical(expected, actual) + class TestTileIDsFromNestedList(object): def test_1d(self): From 18245384c12d77177c0c230b7826739ca5089234 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 26 Nov 2018 13:31:51 +0000 Subject: [PATCH 20/96] Removed TODOs --- xarray/core/combine.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index bf44997e9f6..2d5da1053fb 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -375,9 +375,6 @@ def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) - # Currently if concat_dims is not supplied then _auto_concat attempts to - # deduce it on every call - # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: @@ -422,11 +419,9 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry -def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): +def _check_shape_tile_ids(combined_tile_ids): tile_ids = combined_tile_ids.keys() - # TODO a check that only the expected types of objects are contained - # Check all tuples are the same length # i.e. 
check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] From d380815de26510fd977269e35b8b9648db279769 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 18:58:02 +0000 Subject: [PATCH 21/96] Removed format strings --- xarray/core/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 2d5da1053fb..d81849e037b 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -436,7 +436,7 @@ def _check_shape_tile_ids(combined_tile_ids): if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " "because sub-lists do not have consistent " - "lengths along dimension {}".format(str(dim))) + "lengths along dimension" + str(dim)) def _data_vars(combined_id): From c4bb8d0396498154da3521ffe7c9a81a4199cf7b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 20:07:54 +0000 Subject: [PATCH 22/96] test_get_new_tile_ids now doesn't assume dicts are ordered --- xarray/tests/test_combine.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0edc715698a..b4c3c36f46a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, auto_combine, concat +from xarray import DataArray, Dataset, Variable, auto_combine, concat, merge from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( _new_tile_id, _auto_combine_all_along_first_dim, @@ -529,9 +529,9 @@ def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = create_combined_ids(shape) - for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): - expected_new_tile_id = tile_id[1:] - assert _new_tile_id(combined) == expected_new_tile_id + expected_tile_ids = sorted(combined_ids.keys()) + actual_tile_ids = _create_tile_ids(shape) + assert expected_tile_ids == actual_tile_ids @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): @@ -547,16 +547,36 @@ def test_concat_once(self, create_combined_ids, concat_dim): expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) - def test_concat_twice(self, create_combined_ids): + def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - result = _combine_nd(combined_ids, concat_dims=['dim1', 'dim2']) + ds = create_test_data + result = _auto_combine_all_along_first_dim(combined_ids, + dim='dim1', + data_vars='all', + coords='different', + compat='no_conflicts') ds = create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') partway2 = concat([ds(1), ds(4)], dim='dim1') partway3 = concat([ds(2), ds(5)], dim='dim1') - expected = concat([partway1, partway2, partway3], dim='dim2') + expected_datasets = [partway1, partway2, partway3] + expected = {(i,): ds for i, ds in enumerate(expected_datasets)} + + assert_combined_tile_ids_equal(result, expected) + + @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) + def test_concat_twice(self, create_combined_ids, concat_dim): + shape = (2, 3) + combined_ids = create_combined_ids(shape) + result = _combine_nd(combined_ids, concat_dims=['dim1', concat_dim]) + + ds = create_test_data + partway1 = 
concat([ds(0), ds(3)], dim='dim1') + partway2 = concat([ds(1), ds(4)], dim='dim1') + partway3 = concat([ds(2), ds(5)], dim='dim1') + expected = concat([partway1, partway2, partway3], dim=concat_dim) assert_equal(result, expected) From 6b7f8891c9e3e113b511fded3e6b7d2801e9873e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:08:42 +0000 Subject: [PATCH 23/96] Fixed failing tests on python3.5 caused by accidentally assuming dict was ordered --- xarray/core/combine.py | 7 ++++--- xarray/tests/test_combine.py | 38 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d81849e037b..aaef8cd4162 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,7 +2,7 @@ import warnings import toolz.itertoolz as itertoolz -from collections import Counter +from collections import Counter, OrderedDict import pandas as pd @@ -470,6 +470,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. It always removes the first + for concat_dim in concat_dims: combined_ids = _auto_combine_all_along_first_dim(combined_ids, dim=concat_dim, @@ -487,11 +488,11 @@ def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, new_combined_ids = {} for new_id, group in grouped.items(): - combined_ids = dict(group) + # TODO is there a way to unpack this object without using OrderedDict? + combined_ids = OrderedDict(sorted(group)) datasets = list(combined_ids.values()) new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, data_vars, coords) - return new_combined_ids diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index b4c3c36f46a..d44ba20ba1c 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -12,7 +12,7 @@ from xarray.core.combine import ( _new_tile_id, _auto_combine_all_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _check_shape_tile_ids, _combine_nd, _auto_combine_1d) + _check_shape_tile_ids, _combine_nd, _auto_combine_1d, _auto_combine) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -410,15 +410,6 @@ def test_auto_combine_no_concat(self): {'baz': [100]}) assert_identical(expected, actual) - @pytest.mark.xfail(reason="Not yet implemented") - def test_infer_order_from_coords(self): - data = create_test_data() - print(data) - objs = [data.isel(x=slice(4, 9)), data.isel(x=slice(4))] - actual = auto_combine(objs, infer_order_from_coords=True) - expected = data - assert_identical(expected, actual) - class TestTileIDsFromNestedList(object): def test_1d(self): @@ -515,7 +506,8 @@ def create_combined_ids(): def _create_combined_ids(shape): tile_ids = _create_tile_ids(shape) nums = range(len(tile_ids)) - return {tile_id: create_test_data(num) for tile_id, num in zip(tile_ids, nums)} + return {tile_id: create_test_data(num) + for tile_id, num in zip(tile_ids, nums)} def _create_tile_ids(shape): @@ -551,6 +543,7 @@ def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) ds = create_test_data + print(combined_ids) result = _auto_combine_all_along_first_dim(combined_ids, dim='dim1', data_vars='all', @@ -563,7 +556,7 @@ def test_concat_only_first_dim(self, create_combined_ids): partway3 = concat([ds(2), ds(5)], dim='dim1') expected_datasets = [partway1, partway2, partway3] expected = {(i,): ds for i, ds in enumerate(expected_datasets)} - + assert_combined_tile_ids_equal(result, expected) @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) @@ -648,3 +641,24 @@ def test_combine_concat_over_redundant_nesting(self): actual = auto_combine(objs, concat_dims=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + + +# TODO test for _new_tile_id + +class TestAutoCombineUsingCoords(object): + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + # Should pass once inferring order from coords is implemented + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + actual = auto_combine(objs, infer_order_from_coords=True) + expected = data + assert_identical(expected, actual) + + def test_order_inferred_from_coords(self): + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + with pytest.raises(NotImplementedError): + _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts', + data_vars='all', coords='different', + infer_order_from_coords=True, ids=True) From 58a3648cc2b4a7465f5cacdaa8e6eb67e54ff583 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:21:23 +0000 Subject: [PATCH 24/96] Test for getting new tile id --- xarray/tests/test_combine.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index d44ba20ba1c..87d4efb57f5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -517,6 +517,15 @@ def _create_tile_ids(shape): @requires_dask # only for toolz class TestCombineND(object): + @pytest.mark.parametrize("old_id, new_id", [((3,0,1), (0,1)), + ((0, 0), (0,)), + ((1,), ()), + ((0,), ()), + ((1, 0), (0,))]) + def test_new_tile_id(self, old_id, new_id): + ds = create_test_data + assert _new_tile_id((old_id, ds)) == new_id + def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = create_combined_ids(shape) @@ -643,8 +652,6 @@ def test_combine_concat_over_redundant_nesting(self): assert_identical(expected, actual) -# TODO test for 
_new_tile_id - class TestAutoCombineUsingCoords(object): @pytest.mark.xfail(reason="Not yet implemented") def test_infer_order_from_coords(self): From a12a34a9be51fb16cfebcf13ad0ccb0ef26a4edd Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:51:38 +0000 Subject: [PATCH 25/96] Fixed itertoolz import so that it's compatible with older versions --- xarray/core/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index aaef8cd4162..274659b2d61 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function import warnings -import toolz.itertoolz as itertoolz +from toolz import itertoolz from collections import Counter, OrderedDict import pandas as pd From ada1f4a4ef501acb571c1724a84f3da2e9b05ac2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 13:43:54 +0000 Subject: [PATCH 26/96] Increased test coverage --- xarray/core/combine.py | 7 +------ xarray/tests/test_backends.py | 8 ++++++-- xarray/tests/test_combine.py | 34 +++++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 274659b2d61..cf832b1a9d2 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,7 +2,7 @@ import warnings from toolz import itertoolz -from collections import Counter, OrderedDict +from collections import Counter import pandas as pd @@ -439,11 +439,6 @@ def _check_shape_tile_ids(combined_tile_ids): "lengths along dimension" + str(dim)) -def _data_vars(combined_id): - id, ds = combined_id - return tuple(sorted(ds.data_vars)) - - def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts'): """ diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 42893cf5d9e..2dc950b9234 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2214,8 +2214,12 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], - infer_order_from_coords=True) as actual: + + with pytest.raises(NotImplementedError): + open_mfdataset([tmp1, tmp2], infer_order_from_coords=True) + + # With infer_order_from_coords=True this should pass in future + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 87d4efb57f5..9258dc0b5ee 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -497,6 +497,10 @@ def test_infer_from_datasets(self): (input, ['dim1']) assert_combined_tile_ids_equal(expected, actual) + input = [ds(0), ds(1)] + with pytest.raises(ValueError): + _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -602,10 +606,7 @@ def test_check_lengths(self): @requires_dask # only for toolz class TestAutoCombineND(object): - # TODO there should be a lot more tests in here testing different cases - def test_single_dataset(self): - objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] actual = auto_combine(objs) expected = Dataset({'x': [0, 1]}) @@ -640,6 +641,10 @@ def test_invalid_hypercube_input(self): 'consistent depths'): auto_combine(datasets, concat_dims=['dim1', 'dim2']) + datasets = [[ds(0), ds(1)], 
[ds(3), ds(4)]] + with raises_regex(ValueError, 'concat_dims has length'): + auto_combine(datasets, concat_dims=['dim1']) + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) @@ -651,16 +656,18 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + objs = [[Dataset({'x': [0]})]] + actual = auto_combine(objs, concat_dims=[None, None]) + expected = Dataset({'x': [0]}) + assert_identical(expected, actual) + class TestAutoCombineUsingCoords(object): - @pytest.mark.xfail(reason="Not yet implemented") - def test_infer_order_from_coords(self): - # Should pass once inferring order from coords is implemented + def test_infer_order_from_coords_not_implemented(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = auto_combine(objs, infer_order_from_coords=True) - expected = data - assert_identical(expected, actual) + with pytest.raises(NotImplementedError): + auto_combine(objs, infer_order_from_coords=True) def test_order_inferred_from_coords(self): data = create_test_data() @@ -669,3 +676,12 @@ def test_order_inferred_from_coords(self): _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts', data_vars='all', coords='different', infer_order_from_coords=True, ids=True) + + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + # Should pass once inferring order from coords is implemented + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + actual = auto_combine(objs) # but with infer_order_from_coords=True + expected = data + assert_identical(expected, actual) From ef0a30e789521e03a7f5322f488be1dafc208481 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:15:11 +0000 Subject: [PATCH 27/96] Added toolz as an explicit dependency to pass tests on python2.7 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3b56d9265af..ebc69b18f1c 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'Topic :: Scientific/Engineering', ] -INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] +INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2', 'toolz >= 0.9.0'] TESTS_REQUIRE = ['pytest >= 2.7.1'] if sys.version_info[0] < 3: TESTS_REQUIRE.append('mock') From 3be70bc2f181c1bc1cf04dc4da0154854bc78239 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:40:06 +0000 Subject: [PATCH 28/96] Updated 'what's new' --- doc/whats-new.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1da1da700e7..1d021a7f98e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,7 @@ What's New always be available to python 2.7 users. For more information see the following references - - `Xarray Github issue discussing dropping Python 2 `__ + - `Xarray Github issue discussing dropping Python 2 `__ - `Python 3 Statement `__ - `Tips on porting to Python 3 `__ @@ -33,6 +33,19 @@ v0.11.1 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ +- Auto-combine along N dimensions: + + - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any + number of dimensions, instead of just a 1D list of datasets. To combine + along multiple dimensions the datasets must be passed as a nested + list-of-lists. 
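For illustration, the nested-list usage described above looks roughly like the following sketch (the dataset construction is invented for this example, and the call follows the ``concat_dims`` keyword used at this point in the series)::

    import numpy as np
    import xarray as xr

    # A 4x4 field split into four quadrants along 'x' and 'y'.
    full = xr.Dataset({'a': (('x', 'y'), np.arange(16).reshape(4, 4))},
                      coords={'x': [0, 1, 2, 3], 'y': [10, 20, 30, 40]})
    quads = [[full.isel(x=slice(0, 2), y=slice(0, 2)),
              full.isel(x=slice(0, 2), y=slice(2, 4))],
             [full.isel(x=slice(2, 4), y=slice(0, 2)),
              full.isel(x=slice(2, 4), y=slice(2, 4))]]

    # One concat_dims entry per level of nesting, outermost first: the outer
    # list is concatenated along 'x', each inner list along 'y'.
    combined = xr.auto_combine(quads, concat_dims=['x', 'y'])
    # `combined` should round-trip back to `full`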
+ + Breaking because ``open_mfdataset`` and ``auto_combine`` now expect an + argument ``concat_dims`` instead of ``concat_dim``. ``concat_dims`` accepts + a list of valid ``concat_dim`` arguments, e.g. ``['dim1', 'dim2']``. + (:issue:`2159`) + By `Tom Nicholas `_. + Enhancements ~~~~~~~~~~~~ From f266bc36f082951f2f6b418f57b49ea45668f240 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:52:40 +0000 Subject: [PATCH 29/96] No longer attempts to shortcut all concatenation at once if concat_dims=None --- xarray/core/combine.py | 53 ++++++++++++++++-------------------- xarray/tests/test_combine.py | 5 ++++ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index cf832b1a9d2..668348ccafe 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -377,7 +377,7 @@ def _infer_concat_order_from_positions(datasets, concat_dims): tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: + if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None: concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: @@ -516,39 +516,34 @@ def _new_tile_id(single_id_ds_pair): def _auto_combine(datasets, concat_dims, compat, data_vars, coords, infer_order_from_coords, ids): """ - This function decides if any concatenation is necessary, and if so it calls - the logic to decide their concatenation order before concatenating. + Calls logic to decide concatenation order before concatenating. """ - if concat_dims is not None: - # Arrange datasets for concatenation - if infer_order_from_coords: - raise NotImplementedError - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - # Should look like - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, - # concat_dims) - else: - # Use information from the shape of the user input - if not ids: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions\ + # Arrange datasets for concatenation + if infer_order_from_coords: + raise NotImplementedError + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + # Should look like: + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, + # concat_dims) + else: + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_positions\ (datasets, concat_dims) - else: - # Already sorted so just use the ids already passed - combined_ids = dict(zip(ids, datasets)) + else: + # Already sorted so just use the ids already passed + combined_ids = dict(zip(ids, datasets)) - # Check that the inferred shape is combinable - _check_shape_tile_ids(combined_ids) + # Check that the inferred shape is combinable + _check_shape_tile_ids(combined_ids) - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted at all - combined = merge(datasets, compat=compat) + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) return combined diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 9258dc0b5ee..4225099aa05 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -661,6 +661,11 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({'x': [0]}) assert_identical(expected, actual) + objs = [[Dataset({'x': [0]})]] + actual = auto_combine(objs, concat_dims=None) + expected = Dataset({'x': [0]}) + assert_identical(expected, actual) + class TestAutoCombineUsingCoords(object): def test_infer_order_from_coords_not_implemented(self): From 878e1f9d3650d0e090acc9d9395015d9046fb950 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 20:07:50 +0000 Subject: [PATCH 30/96] Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to remove hidden dependency on toolz --- setup.py | 2 +- xarray/core/combine.py | 32 +++++++++++++++++--------------- xarray/tests/test_combine.py | 3 --- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index ebc69b18f1c..3b56d9265af 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'Topic :: Scientific/Engineering', ] -INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2', 'toolz >= 0.9.0'] +INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] TESTS_REQUIRE = ['pytest >= 2.7.1'] if sys.version_info[0] < 3: TESTS_REQUIRE.append('mock') diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 668348ccafe..4ae7b519a84 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,12 +1,11 @@ from __future__ import absolute_import, division, print_function import warnings -from toolz import itertoolz +import itertools from collections import Counter import pandas as pd -from . import utils from .alignment import align from .merge import merge from .pycompat import OrderedDict, basestring, iteritems @@ -373,7 +372,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_positions(datasets, concat_dims): - combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) + combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ())) tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) @@ -463,9 +462,9 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation - # Each iteration of this loop reduces the length of the tile_IDs tuples - # by one. 
It always removes the first - + # Each iteration of this loop reduces the length of the tile_ids tuples + # by one. It always combines along the first dimension, removing the first + # element of the tuple for concat_dim in concat_dims: combined_ids = _auto_combine_all_along_first_dim(combined_ids, dim=concat_dim, @@ -479,13 +478,15 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat): # Group into lines of datasets which must be combined along dim - grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) + # need to sort by _new_tile_id first for groupby to work + # TODO remove all these sorted OrderedDicts once python >= 3.6 only + combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id)) + grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id) new_combined_ids = {} - for new_id, group in grouped.items(): - # TODO is there a way to unpack this object without using OrderedDict? + for new_id, group in grouped: combined_ids = OrderedDict(sorted(group)) - datasets = list(combined_ids.values()) + datasets = combined_ids.values() new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, data_vars, coords) return new_combined_ids @@ -497,11 +498,12 @@ def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), - datasets).values() - concatenated = [_auto_concat(ds, dim=dim, + grouped = itertools.groupby(datasets, + key=lambda ds: tuple(sorted(ds.data_vars)), + ) + concatenated = [_auto_concat(list(ds_group), dim=dim, data_vars=data_vars, coords=coords) - for ds in grouped] + for id, ds_group in grouped] else: concatenated = datasets merged = merge(concatenated, compat=compat) @@ -536,7 +538,7 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, (datasets, concat_dims) else: # Already sorted so just use the ids already passed - combined_ids = dict(zip(ids, datasets)) + combined_ids = OrderedDict(zip(ids, datasets)) # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 4225099aa05..1c1313ccf5a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -485,7 +485,6 @@ def test_uneven_length_input(self): expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} actual = dict(_infer_tile_ids_from_nested_list(input, ())) - print(actual) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -555,8 +554,6 @@ def test_concat_once(self, create_combined_ids, concat_dim): def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - ds = create_test_data - print(combined_ids) result = _auto_combine_all_along_first_dim(combined_ids, dim='dim1', data_vars='all', From e6f25a31799b75eaf22a2d05a730aa400736800c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 20:25:35 +0000 Subject: [PATCH 31/96] Fixed erroneous removal of utils import --- xarray/core/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 50607392b76..adb9ed32986 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -6,6 +6,7 @@ 
import pandas as pd +from . import utils from .alignment import align from .merge import merge from .pycompat import OrderedDict, basestring, iteritems @@ -44,7 +45,7 @@ def concat(objs, dim=None, data_vars='all', coords='different', * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. If objects are DataArrays, data_vars must be 'all'. - coords : {'minimal', 'different', 'all' or list of str}, optional + coords : {'minimal', 'different', 'all' o list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. From f85648548e325d9a352ffd8ac60261ef33cc2329 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 2 Dec 2018 19:31:22 +0000 Subject: [PATCH 32/96] Updated docstrings to include an example of multidimensional concatenation --- xarray/backends/api.py | 18 ++++++------- xarray/core/combine.py | 58 +++++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 417e2c016ee..dd176b33310 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -488,14 +488,18 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, """Open multiple files as a single dataset. Requires dask to be installed. See documentation for details on dask [1]. + Uses ``auto_combine`` to combine the opened datasets - see + ``auto_combine`` for details. Attributes from the first dataset file are used for the combined dataset. Parameters ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit - list of files to open. Paths can be given as strings or as pathlib - Paths. + list of files to open. Paths can be given as strings or as pathlib + Paths. If concatenation along more than one dimension is desired, then + ``paths`` must be a nested list-of-lists (see ``auto_combine`` for + details). chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. @@ -510,8 +514,8 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. + component files. Set ``concat_dims=[..., None, ...]`` explicitly to + disable concatenation along a particular dimension. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -562,12 +566,6 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition the 'minimal' coordinates. - infer_order_from_coords : bool, optional - If true attempt to deduce the order in which the datasets should be - concatenated from their coordinates. To do this the coordinates should - be monotonic along the dimension to be concatenated. - If false instead read the order from the structure the datasets are - supplied in. This structure should be a nested list of lists. 
parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index adb9ed32986..5d14bf76520 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -561,6 +561,20 @@ def auto_combine(datasets, datasets into a single entity by inspecting metadata and using a combination of concat and merge. + Does not sort data under any circumstances. It does align coordinates, but + different variables on datasets can cause it to fail under some scenarios. + In complex cases, you may need to clean up your data and use concat/merge + explicitly. + + Works well if, for example, you have N years of data and M data variables, + and each combination of a distinct time period and set of data variables is + saved as its own dataset. + + Can concatenate along multiple dimensions. To do this the datasets must be + passed as a nested list-of-lists, with a depth equal to the length of + ``concat_dims``. ``auto_combine`` will concatenate along the top-level list + first. + Parameters ---------- datasets : sequence of xarray.Dataset @@ -572,8 +586,8 @@ def auto_combine(datasets, dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. + component files. Set ``concat_dims=[..., None, ...]`` explicitly to + disable concatenation along a particular dimension. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -591,21 +605,47 @@ def auto_combine(datasets, Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat - infer_order_from_coords : bool, optional - If true attempt to deduce the order in which the datasets should be - concatenated from their coordinates. To do this the coordinates should - be monotonic along the dimension to be concatenated. - If false instead read the order from the structure the datasets are - supplied in. This structure should be a nested list of lists. Returns ------- combined : xarray.Dataset + Examples + -------- + + Collecting output from a parallel simulation: + + Collecting data from a simulation which decomposes its domain into 4 parts, + 2 each along both the x and y axes, requires organising the datasets into a + nested list, e.g. + + >>> x1y1 + + Dimensions: (x: 2, y: 2) + Coordinates: + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + lat (x, y) float64 42.25 42.21 42.63 42.59 + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... + + >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] + >>> combined = xr.auto_combine(ds_grid, concat_dims=['x', 'y']) + + Dimensions: (x: 4, y: 4) + Coordinates: + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + lat (x, y) float64 42.25 42.21 42.63 42.59 + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... 
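For intuition, the call in the example above is roughly equivalent to this explicit sequence of concatenations (a sketch reusing the placeholder dataset names from the docstring example)::

    import xarray as xr

    # The N-D combine first concatenates along the outermost level of
    # nesting ('x' here)...
    col_y1 = xr.concat([x1y1, x2y1], dim='x')
    col_y2 = xr.concat([x1y2, x2y2], dim='x')
    # ...and then along the next level ('y').
    combined = xr.concat([col_y1, col_y2], dim='y')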
+ See also -------- concat - Dataset.merge + merge """ # The IDs argument tells _auto_combine that the datasets are not yet sorted From 6305d83988be8d7807d90c580d5e01ce72293012 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Dec 2018 16:26:15 +0000 Subject: [PATCH 33/96] Clarified auto_combine docstring for N-D behaviour --- xarray/core/combine.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 5d14bf76520..19ad58b5224 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -461,7 +461,6 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', combined_ds : xarray.Dataset """ - # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_ids tuples # by one. It always combines along the first dimension, removing the first @@ -557,8 +556,8 @@ def auto_combine(datasets, infer_order_from_coords=False): """Attempt to auto-magically combine the given datasets into one. - This method attempts to combine a list (or nested list of lists) of - datasets into a single entity by inspecting metadata and using a + This method attempts to combine a group of datasets along any number of + dimensions into a single entity by inspecting metadata and using a combination of concat and merge. Does not sort data under any circumstances. It does align coordinates, but @@ -577,8 +576,11 @@ def auto_combine(datasets, Parameters ---------- - datasets : sequence of xarray.Dataset - Dataset objects to merge. + datasets : sequence of xarray.Dataset, or nested list of xarray.Dataset + objects. + Dataset objects to combine. + If concatenation along more than one dimension is desired, then + datasets must be supplied in a nested list-of-lists. concat_dims : list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if @@ -588,6 +590,8 @@ def auto_combine(datasets, By default, xarray attempts to infer this argument by examining component files. Set ``concat_dims=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. + Must be the same length as the depth of the list passed to + ``datasets``. 
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for From ce59da139c678b5d20dedf9367635e7d12c6cd08 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 10 Dec 2018 15:14:01 +0000 Subject: [PATCH 34/96] Added unit test for nested list of Datasets with different variables --- xarray/tests/test_combine.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 1c1313ccf5a..b523ec11e50 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -642,6 +642,24 @@ def test_invalid_hypercube_input(self): with raises_regex(ValueError, 'concat_dims has length'): auto_combine(datasets, concat_dims=['dim1']) + def test_merge_one_dim_concat_another(self): + objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'bar': ('x', [10, 20])})], + [Dataset({'foo': ('x', [2, 3])}), Dataset({'bar': ('x', [30, 40])})]] + expected = Dataset({'foo': ('x', [0, 1, 2, 3]), + 'bar': ('x', [10, 20, 30, 40])}) + + actual = auto_combine(objs, concat_dims=['x', None]) + assert_identical(expected, actual) + + actual = auto_combine(objs) + assert_identical(expected, actual) + + # Proving it works symmetrically + objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'foo': ('x', [2, 3])})], + [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]] + actual = auto_combine(objs, concat_dims=[None, 'x']) + assert_identical(expected, actual) + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) From 9fb34cf8e402a02dbf374ca3084d74cf08bf1f71 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 10 Dec 2018 15:33:09 +0000 Subject: [PATCH 35/96] Minor spelling and pep8 fixes --- xarray/backends/api.py | 8 ++++---- xarray/core/combine.py | 12 +++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dd176b33310..1b4935f23b0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -602,10 +602,10 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # If infer_order_from_coords=True then this is unnecessary, but quick. # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure - combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ - (paths, concat_dims) - ids, paths = list(combined_ids_paths.keys()), \ - list(combined_ids_paths.values()) + combined_ids_paths, concat_dims = _infer_concat_order_from_positions( + paths, concat_dims) + ids, paths = ( + list(combined_ids_paths.keys()), list(combined_ids_paths.values())) open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 19ad58b5224..a0e01cda2b7 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -45,7 +45,7 @@ def concat(objs, dim=None, data_vars='all', coords='different', * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. If objects are DataArrays, data_vars must be 'all'. 
- coords : {'minimal', 'different', 'all' o list of str}, optional + coords : {'minimal', 'different', 'all' or list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. @@ -393,7 +393,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): Given a list of lists (of lists...) of objects, returns a iterator which returns a tuple containing the index of each object in the nested list structure as the key, and the object. This can then be called by the - dict constructor to create a dictionary of the objects organised byt their + dict constructor to create a dictionary of the objects organised by their position in the original nested list. Recursively traverses the given structure, while keeping track of the @@ -424,14 +424,14 @@ def _check_shape_tile_ids(combined_tile_ids): # Check all tuples are the same length # i.e. check that all lists are nested to the same depth - nesting_depths = [len(id) for id in tile_ids] + nesting_depths = [len(tile_id) for tile_id in tile_ids] if not set(nesting_depths) == {nesting_depths[0]}: raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") # Check all lists along one dimension are same length for dim in range(nesting_depths[0]): - indices_along_dim = [id[dim] for id in tile_ids] + indices_along_dim = [tile_id[dim] for tile_id in tile_ids] occurrences = Counter(indices_along_dim) if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " @@ -498,9 +498,7 @@ def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertools.groupby(datasets, - key=lambda ds: tuple(sorted(ds.data_vars)), - ) + grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds))) concatenated = [_auto_concat(list(ds_group), dim=dim, data_vars=data_vars, coords=coords) for id, ds_group in grouped] From 83dedb31716ef81cece3e2b659bb65fa3b0de76b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 11 Dec 2018 11:42:00 +0000 Subject: [PATCH 36/96] Started working on a new api with both auto_combine and manual_combine --- xarray/backends/api.py | 31 +++++++--- xarray/core/combine.py | 136 ++++++++++++++++++++++++++++++----------- 2 files changed, 122 insertions(+), 45 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1b4935f23b0..2303773d11a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -483,10 +483,14 @@ def close(self): def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', - infer_order_from_coords=False, + combine='manual', autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. + If combine='auto' then the function `auto_combine` is used, and if + combine='manual' then `manual_combine` is used, and the filepaths + must be structured accordingly. + Requires dask to be installed. See documentation for details on dask [1]. Uses ``auto_combine`` to combine the opened datasets - see ``auto_combine`` for details. 
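As a rough usage sketch of the two modes being introduced here (the file names are placeholders, the nested-list layout mirrors ``test_open_mfdataset_2d``, and the keyword is still spelled ``concat_dims`` at this point in the series; a later patch renames it to ``concat_dim``)::

    import xarray as xr

    # Hypothetical files holding the four quadrants of a 2D domain: the
    # outer list is stacked along 'y', each inner list along 'x'.
    ds = xr.open_mfdataset([['x0y0.nc', 'x1y0.nc'],
                            ['x0y1.nc', 'x1y1.nc']],
                           concat_dims=['y', 'x'], combine='manual')

With ``combine='auto'`` the ordering is instead meant to be deduced from the coordinate values in each file, so a flat list of paths should eventually be enough; that branch is still a work in progress in this patch.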
@@ -579,6 +583,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, See Also -------- auto_combine + manual_combine open_dataset References @@ -633,15 +638,21 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # Close datasets in case of a ValueError try: - if infer_order_from_coords: - # Discard ordering because it should be redone from coordinates - ids = False - - combined = _auto_combine(datasets, concat_dims=concat_dims, - compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=infer_order_from_coords, - ids=ids) + if combine is 'auto': + # Redo ordering from coordinates + raise NotImplementedError + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + # Should look like: + combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, + concat_dims) + + # Check that the inferred shape is combinable + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords, combine) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a0e01cda2b7..ba838650502 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -547,47 +547,38 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, return combined -def auto_combine(datasets, - concat_dims=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', - data_vars='all', coords='different', - infer_order_from_coords=False): - """Attempt to auto-magically combine the given datasets into one. - - This method attempts to combine a group of datasets along any number of - dimensions into a single entity by inspecting metadata and using a - combination of concat and merge. +def manual_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', data_vars='all', coords='different'): + """ + Combine an N-dimensional grid of datasets into one explicitly by using a + succession of concat and merge operations along each dimension of the grid. - Does not sort data under any circumstances. It does align coordinates, but - different variables on datasets can cause it to fail under some scenarios. - In complex cases, you may need to clean up your data and use concat/merge - explicitly. + Does not sort data under any circumstances, so the datsets must be passed + in the order you wish them to be concatenated. It does align coordinates, + but different variables on datasets can cause it to fail under some + scenarios. In complex cases, you may need to clean up your data and use + concat/merge explicitly. - Works well if, for example, you have N years of data and M data variables, - and each combination of a distinct time period and set of data variables is - saved as its own dataset. + To concatenate along multiple dimensions the datasets must be passed as a + nested list-of-lists, with a depth equal to the length of ``concat_dims``. + ``manual_combine`` will concatenate along the top-level list first. - Can concatenate along multiple dimensions. To do this the datasets must be - passed as a nested list-of-lists, with a depth equal to the length of - ``concat_dims``. ``auto_combine`` will concatenate along the top-level list - first. + Useful for combining datasets from a set of nested directories, or for + collecting the output of a simulation parallelized along multiple + dimensions. 
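A minimal sketch of the kind of call this new function is aimed at, mirroring the existing ``test_merge_one_dim_concat_another`` case but using the ``manual_combine`` name being introduced here (treat it as illustrative while the internals are still being wired up)::

    import xarray as xr

    objs = [[xr.Dataset({'foo': ('x', [0, 1])}),
             xr.Dataset({'bar': ('x', [10, 20])})],
            [xr.Dataset({'foo': ('x', [2, 3])}),
             xr.Dataset({'bar': ('x', [30, 40])})]]

    # Concatenate along 'x' across the outer list, and merge (a concat_dims
    # entry of None) the differing variables within each inner list.
    combined = xr.manual_combine(objs, concat_dims=['x', None])
    # expected result: foo == [0, 1, 2, 3] and bar == [10, 20, 30, 40]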
Parameters ---------- - datasets : sequence of xarray.Dataset, or nested list of xarray.Dataset - objects. + datasets : list or nested list of xarray.Dataset objects. Dataset objects to combine. - If concatenation along more than one dimension is desired, then - datasets must be supplied in a nested list-of-lists. + If concatenation or merging along more than one dimension is desired, + then datasets must be supplied in a nested list-of-lists. concat_dims : list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by - :py:func:`xarray.concat`. You only need to provide this argument if - any of the dimensions along which you want to concatenate are not a - dimension in the original datasets, e.g., if you want to stack a - collection of 2D arrays along a third dimension. + :py:func:`xarray.concat`. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dims=[..., None, ...]`` explicitly to - disable concatenation along a particular dimension. + disable concatenation and merge instead along a particular dimension. Must be the same length as the depth of the list passed to ``datasets``. compat : {'identical', 'equals', 'broadcast_equals', @@ -650,8 +641,83 @@ def auto_combine(datasets, merge """ - # The IDs argument tells _auto_combine that the datasets are not yet sorted - return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=infer_order_from_coords, - ids=False) + # The IDs argument tells _combine that the datasets are not yet sorted + return _combine(datasets, concat_dims=concat_dims, compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=False, ids=False) + + +def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', data_vars='all', coords='different'): + """ + Attempt to auto-magically combine the given datasets into one. + + This method attempts to combine a group of datasets along any number of + dimensions into a single entity by inspecting coords and metadata and using + a combination of concat and merge. + + Will attempt to order the datasets such that their coordinate values are + monotonically increasing along all dimensions. If it cannot determine the + order in which to concatenate the datasets, it will raise an error. + + It does align coordinates, + but different variables on datasets can cause it to fail under some + scenarios. In complex cases, you may need to clean up your data and use + concat/merge explicitly. + + Works well if, for example, you have N years of data and M data variables, + and each combination of a distinct time period and set of data variables is + saved as its own dataset. Also useful for if you have a simulation which is + parallelized in multiple dimensions, but has global coordinates saved in + each file specifying it's position within the domain. + + + Parameters + ---------- + datasets : sequence of xarray.Dataset + Dataset objects to combine. + concat_dim : str, DataArray, Index or None, optional + Dimension along which to concatenate variables, as used by + :py:func:`xarray.concat`. You only need to provide this argument if + the dimension along which you want to concatenate is not a + dimension in the original datasets, e.g., if you want to stack a + collection of 2D arrays along a third dimension. + By default, xarray attempts to infer this argument by examining + component files. 
Set ``concat_dim=None`` explicitly to + disable concatenation along a particular dimension. + Must be the same length as the depth of the list passed to + ``datasets``. + compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional + String indicating how to compare variables of the same name for + potential conflicts: + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + coords : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + + Returns + ------- + combined : xarray.Dataset + + See also + -------- + concat + merge + """ + if len(concat_dim) > 1: + raise ValueError + + # The IDs argument tells _combine that the datasets are not yet sorted + return _combine(datasets, concat_dims=[concat_dim], compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=True, ids=False) From 3e64a83d9557c896d7650755d7d471579c568209 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 3 Jan 2019 16:55:17 +0000 Subject: [PATCH 37/96] Wrote basic function to infer concatenation order from coords. Needs better error handling though. --- xarray/backends/api.py | 13 ++-- xarray/core/combine.py | 76 +++++++++++++++++++++++ xarray/tests/test_combine.py | 115 +++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 7 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index fcdc92d7307..dd0ab10032f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,9 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import _infer_concat_order_from_positions, _auto_combine +from ..core.combine import (_infer_concat_order_from_positions, + _infer_concat_order_from_coords, + _check_shape_tile_ids, _combine_nd) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -643,19 +645,16 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, try: if combine is 'auto': # Redo ordering from coordinates - raise NotImplementedError - # TODO Use coordinates to determine tile_ID for each dataset in N-D # Ignore how they were ordered previously - # Should look like: - combined_ids, concat_dims = _infer_tile_ids_from_coords( - datasets, concat_dims) + combined_ids, concat_dims = _infer_concat_order_from_coords( + datasets) # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords, combine) + data_vars=data_vars, coords=coords, combine='auto') except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index df0babd06f1..f00a463ca37 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -6,6 +6,8 @@ import pandas as pd +import numpy as np + from . 
import utils from .alignment import align from .merge import merge @@ -419,6 +421,80 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry +def _infer_concat_order_from_coords(datasets): + + concat_dims = [] + tile_ids = [() for ds in datasets] + + # All datasets have same variables because they've been grouped as such + ds0 = datasets[0] + for dim in ds0.dims: + + # Check if dim is a coordinate dimension + if dim in ds0: + + # Need to read coordinate values to do ordering + coord_vals = [ds[dim].values for ds in datasets] + + # If dimension coordinate values are same on every dataset then + # should be leaving this dimension alone (it's just a "bystander") + if not _all_arrays_equal(coord_vals): + + # Infer order datasets should be arranged in along this dim + concat_dims.append(dim) + + # TODO generalise this to deduce whether coord should be monotonically increasing or decreasing + if not all(pd.Index(coord).is_monotonic_increasing + for coord in coord_vals): + raise ValueError(f"Coordinate variable {dim} is not " + "monotonically increasing on all " + "datasets") + + # Sort datasets along dim + # Assume that any two datasets whose coord along dim starts with + # the same value have the exact same coord values throughout. + first_coord_vals = [coord[0] for coord in coord_vals] + new_positions = _infer_order_1d(first_coord_vals, + method='dense') + + # TODO check that resulting global coordinate is monotonic + + # Append positions along extra dimension to structure which + # encodes the multi-dimensional concatentation order + tile_ids = [tile_id + (position,) for tile_id, position + in zip(tile_ids, new_positions)] + + # TODO check that this is still the correct logic for case of merging but no concatenation + if len(datasets) > 1 and not concat_dims: + raise ValueError("Could not find any suitable dimension coordinates to" + " use to order the datasets for concatenation") + + combined_ids = OrderedDict(zip(tile_ids, datasets)) + + return combined_ids, concat_dims + + +def _all_arrays_equal(iterator): + try: + iterator = iter(iterator) + first = next(iterator) + return all(np.array_equal(first, rest) for rest in iterator) + except StopIteration: + return True + + +def _infer_order_1d(arr, method='dense'): + # TODO Special cases for string coords - natural sorting instead? + # TODO sort datetime coords too + arr = np.array(arr) + + # We want rank but with identical elements given identical position indices + # - they should be concatenated along another dimension, not along this one + ranks = pd.Series(arr).rank(method=method).values + + return ranks.astype('int') - 1 + + def _check_shape_tile_ids(combined_tile_ids): tile_ids = combined_tile_ids.keys() diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 980e9a7fb6c..89f87b46edf 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -4,6 +4,7 @@ from itertools import product import numpy as np +import numpy.testing as npt import pandas as pd import pytest @@ -12,6 +13,7 @@ from xarray.core.combine import ( _new_tile_id, _auto_combine_all_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, + _infer_concat_order_from_coords, _infer_order_1d, _all_arrays_equal, _check_shape_tile_ids, _combine_nd, _auto_combine_1d, _auto_combine) from . 
import ( @@ -500,6 +502,119 @@ def test_infer_from_datasets(self): _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) +class TestInferOrder1D(object): + def test_arrays(self): + npt.assert_equal(_infer_order_1d([3, 1, 2, 7]), np.array([2, 0, 1, 3])) + npt.assert_equal(_infer_order_1d([5, 7, 8, 8]), np.array([0, 1, 2, 2])) + npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([0, 0, 1])) + npt.assert_equal(_infer_order_1d([2, 5, 5, 1]), np.array([1, 2, 2, 0])) + + @pytest.mark.xfail + def test_strings(self): + npt.assert_equal(_infer_order_1d(['b', 'a']), np.array([1, 0])) + npt.assert_equal(_infer_order_1d(['aa', 'a']), np.array([1, 0])) + npt.assert_equal(_infer_order_1d(['c1', 'c0']), np.array([1, 0])) + + npt.assert_equal(_infer_order_1d(['c1', 'c0', 'c0']), + np.array([1, 0, 0])) + + # Natural sorting + npt.assert_equal(_infer_order_1d(['c1', 'c0', 'c10']), + np.array([1, 0, 2])) + + @pytest.mark.skip + def test_datetimes(self): + pass + + +def test_all_arrays_equal(): + assert _all_arrays_equal([np.array([1,2,3]), + np.array([1,2,3]), + np.array([1,2,3])]) + assert not _all_arrays_equal([np.array([1, 2, 3]), + np.array([1, 2, 3]), + np.array([1, 2, 4])]) + + +class TestTileIDsFromCoords(object): + def test_1d(self): + ds0 = Dataset({'x': [0, 1]}) + ds1 = Dataset({'x': [2, 3]}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x'] + + def test_2d(self): + ds0 = Dataset({'x': [0, 1], 'y': [10, 20, 30]}) + ds1 = Dataset({'x': [2, 3], 'y': [10, 20, 30]}) + ds2 = Dataset({'x': [0, 1], 'y': [40, 50, 60]}) + ds3 = Dataset({'x': [2, 3], 'y': [40, 50, 60]}) + ds4 = Dataset({'x': [0, 1], 'y': [70, 80, 90]}) + ds5 = Dataset({'x': [2, 3], 'y': [70, 80, 90]}) + + expected = {(0, 0): ds0, (1, 0): ds1, + (0, 1): ds2, (1, 1): ds3, + (0, 2): ds4, (1, 2): ds5} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0, ds3, + ds5, ds2, ds4]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x', 'y'] + + def test_no_dimension_coords(self): + ds0 = Dataset({'foo': ('x', [0, 1])}) + ds1 = Dataset({'foo': ('x', [2, 3])}) + with raises_regex(ValueError, "Could not find any dimension " + "coordinates"): + _infer_concat_order_from_coords([ds1, ds0]) + + def test_coord_not_monotonic(self): + ds0 = Dataset({'x': [0, 1]}) + ds1 = Dataset({'x': [3, 2]}) + with raises_regex(ValueError, "Coordinate variable x is not " + "monotonically increasing"): + _infer_concat_order_from_coords([ds1, ds0]) + + # TODO raise this error message + @pytest.mark.xfail + def test_check_for_impossible_ordering(self): + ds0 = Dataset({'x': [0, 1, 5]}) + ds1 = Dataset({'x': [2, 3]}) + with raises_regex(ValueError, "Unable to arrange datasets such that " + "coordinate values along dimension x are" + " monotonically increasing"): + _infer_concat_order_from_coords([ds1, ds0]) + + def test_no_concatenation_needed(self): + ds = Dataset({'foo': ('x', [0, 1])}) + expected = {(): ds} + actual, concat_dims = _infer_concat_order_from_coords([ds]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == [] + + def test_2d_plus_bystander_dim(self): + ds0 = Dataset({'x': [0, 1], 'y': [10, 20, 30], 't': [0.1, 0.2]}) + ds1 = Dataset({'x': [2, 3], 'y': [10, 20, 30], 't': [0.1, 0.2]}) + ds2 = Dataset({'x': [0, 1], 'y': [40, 50, 60], 't': [0.1, 0.2]}) + ds3 = Dataset({'x': [2, 3], 'y': [40, 50, 60], 't': [0.1, 0.2]}) + + expected = {(0, 0): ds0, (1, 0): ds1, 
+ (0, 1): ds2, (1, 1): ds3} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0, + ds3, ds2]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x', 'y'] + + @pytest.mark.skip + def test_string_coord(self): + pass + + @pytest.mark.skip + def test_datetime_coord(self): + pass + + @pytest.fixture(scope='module') def create_combined_ids(): return _create_combined_ids From 963c794bdfa63dd6e8fa220dcd9026b9629dc691 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 4 Jan 2019 12:13:13 +0000 Subject: [PATCH 38/96] Attempt at finalised version of public-facing API. All the internals still need to be redone to match though. --- xarray/__init__.py | 2 +- xarray/backends/api.py | 50 ++++++++++---------- xarray/core/combine.py | 101 ++++++++++++++++++++--------------------- 3 files changed, 77 insertions(+), 76 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 59a961c6b56..07793b6a6fc 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -9,7 +9,7 @@ from .core.alignment import align, broadcast, broadcast_arrays from .core.common import full_like, zeros_like, ones_like -from .core.combine import concat, auto_combine +from .core.combine import concat, auto_combine, manual_combine from .core.computation import apply_ufunc, dot, where from .core.extensions import (register_dataarray_accessor, register_dataset_accessor) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dd0ab10032f..2e16ee0c07c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -11,8 +11,7 @@ from .. import Dataset, backends, conventions from ..core import indexing from ..core.combine import (_infer_concat_order_from_positions, - _infer_concat_order_from_coords, - _check_shape_tile_ids, _combine_nd) + auto_combine, _manual_combine) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -482,11 +481,10 @@ def close(self): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', - combine='manual', - autoclose=None, parallel=False, **kwargs): + combine='auto', autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. If combine='auto' then the function `auto_combine` is used to combine the @@ -513,13 +511,13 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. - concat_dims : list of str, DataArray, Index or None, optional + concat_dim : str, or list of str, DataArray, Index or None, optional Dimensions to concatenate files along. You only need to provide this argument if any of the dimensions along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dims=[..., None, ...]`` explicitly to + component files. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. 
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional @@ -557,7 +555,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, * 'all': All data variables will be concatenated. * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. - coords : {'minimal', 'different', 'all' o list of str}, optional + coords : {'minimal', 'different', 'all' or list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. @@ -573,9 +571,9 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. - combine : {'manual', 'auto'}, optional + combine : {'auto', 'manual'}, optional Whether ``xarray.auto_combine`` or ``xarray.manual_combine`` is used to - combine all the data. Default is 'manual'. + combine all the data. Default is 'auto'. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -609,11 +607,11 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, raise IOError('no files to open') - # If infer_order_from_coords=True then this is unnecessary, but quick. - # If infer_order_from_coords=False then this creates a flat list which is - # easier to iterate over, while saving the originally-supplied structure + # If combine='auto' then this is unnecessary, but quick. + # If combine='manual' then this creates a flat list which is easier to + # iterate over, while saving the originally-supplied structure as "ids" combined_ids_paths, concat_dims = _infer_concat_order_from_positions( - paths, concat_dims) + paths, concat_dim) ids, paths = ( list(combined_ids_paths.keys()), list(combined_ids_paths.values())) @@ -641,20 +639,24 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # the underlying datasets will still be stored as dask arrays datasets, file_objs = dask.compute(datasets, file_objs) - # Close datasets in case of a ValueError + # Combine all datasets, closing them in case of a ValueError try: if combine is 'auto': - # Redo ordering from coordinates - # Ignore how they were ordered previously - combined_ids, concat_dims = _infer_concat_order_from_coords( - datasets) + # Will redo ordering from coordinates, ignoring how they were + # ordered previously + if concat_dim is not _CONCAT_DIM_DEFAULT: + raise ValueError("Cannot specify dimensions to concatenate " + "along when auto-combining") - # Check that the inferred shape is combinable - _check_shape_tile_ids(combined_ids) + combined = auto_combine(datasets, compat=compat, + data_vars=data_vars, coords=coords) - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords, combine='auto') + else: + # Combined nested list by successive concat and merge operations + # along each dimension, using structure given by "ids" + combined = _manual_combine(datasets, concat_dims=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords, ids=ids) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index f00a463ca37..67ca209ee37 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -589,41 +589,29 @@ def _new_tile_id(single_id_ds_pair): return 
tile_id[1:] -def _auto_combine(datasets, concat_dims, compat, data_vars, coords, - infer_order_from_coords, ids): - """ - Calls logic to decide concatenation order before concatenating. - """ +def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids): # Arrange datasets for concatenation - if infer_order_from_coords: - raise NotImplementedError - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - # Should look like: - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, - # concat_dims) + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_positions( + datasets, concat_dims) else: - # Use information from the shape of the user input - if not ids: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions( - datasets, concat_dims) - else: - # Already sorted so just use the ids already passed - combined_ids = OrderedDict(zip(ids, datasets)) + # Already sorted so just use the ids already passed + combined_ids = OrderedDict(zip(ids, datasets)) # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) - # Repeatedly concatenate then merge along each dimension + # Apply series of concatenate or merge operations along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords) return combined -def manual_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, +def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', data_vars='all', coords='different'): """ Explicitly combine an N-dimensional grid of datasets into one by using a @@ -649,18 +637,18 @@ def manual_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, Dataset objects to combine. If concatenation or merging along more than one dimension is desired, then datasets must be supplied in a nested list-of-lists. - concat_dims : list of str, DataArray, Index or None, optional + concat_dim : str, or list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dims=[..., None, ...]`` explicitly to + component files. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation and merge instead along a particular dimension. Must be the same length as the depth of the list passed to ``datasets``. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for - potential conflicts: + potential merge conflicts: - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. 
@@ -715,26 +703,33 @@ def manual_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, -------- concat merge + auto_combine """ - # The IDs argument tells _combine that the datasets are not yet sorted - return _combine(datasets, concat_dims=concat_dims, compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=False, ids=False) + if isinstance(concat_dim, str): + concat_dim = [concat_dim] + + # The IDs argument tells _manual_combine that datasets aren't yet sorted + return _manual_combine(datasets, concat_dims=concat_dim, compat=compat, + data_vars=data_vars, coords=coords, ids=False) -def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', data_vars='all', coords='different'): +def auto_combine(datasets, compat='no_conflicts', data_vars='all', + coords='different'): """ - Attempt to auto-magically combine the given datasets into one. + Attempt to auto-magically combine the given datasets into one by using + dimension coordinates. This method attempts to combine a group of datasets along any number of dimensions into a single entity by inspecting coords and metadata and using a combination of concat and merge. - Will attempt to order the datasets such that their coordinate values are - monotonically increasing along all dimensions. If it cannot determine the - order in which to concatenate the datasets, it will raise an error. + Will attempt to order the datasets such that the values in their dimension + coordinates are monotonically increasing along all dimensions. If it cannot + determine the order in which to concatenate the datasets, it will raise an + error. + Non-coordinate dimensions will be ignored, as will any coordinate + dimensions which do not vary between each dataset. Aligns coordinates, but different variables on datasets can cause it to fail under some scenarios. In complex cases, you may need to clean up @@ -750,15 +745,6 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, ---------- datasets : sequence of xarray.Dataset Dataset objects to combine. - concat_dim : str, DataArray, Index or None, optional - Dimension along which to concatenate variables, as used by - :py:func:`xarray.concat`. You only need to provide this argument if - the dimension along which you want to concatenate is not a - dimension in the original datasets, e.g., if you want to stack a - collection of 2D arrays along a third dimension. - By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to - disable concatenation. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -786,10 +772,23 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, merge manual_combine """ - if len(concat_dim) > 1: - raise ValueError("Informative message") - # The IDs argument tells _combine that the datasets are not yet sorted - return _combine(datasets, concat_dims=[concat_dim], compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=True, ids=False) + # Group by data vars + grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds))) + + # Perform the multidimensional combine on each group of data variables + # before merging back together + concatenated_grouped_by_data_vars = [] + for var_group in grouped: + combined_ids, concat_dims = _infer_concat_order_from_coords(list(var_group)) + + # TODO check the shape of the combined ids? 
+ + # Concatenate along all of concat_dims one by one to create single ds + concatenated = _combine_nd(combined_ids, concat_dims=concat_dims, + data_vars=data_vars, coords=coords) + + # TODO check the overall coordinates are monotonically increasing? + concatenated_grouped_by_data_vars.append(concatenated) + + return merge(concatenated_grouped_by_data_vars, compat=compat) From 1a6653082fef998c30138810548317d1260e37ac Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 4 Jan 2019 12:34:41 +0000 Subject: [PATCH 39/96] No longer uses entire old auto_combine internally, only concat or merge --- xarray/core/combine.py | 52 +++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 67ca209ee37..f81ca0005cb 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -518,7 +518,8 @@ def _check_shape_tile_ids(combined_tile_ids): def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts'): """ - Concatenates and merges an N-dimensional structure of datasets. + Combines an N-dimensional structure of datasets into one by applying a + series of either concat and merge operations along each dimension. No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, because it is assumed that this has already been done. @@ -530,58 +531,63 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', keys, which specify position within the desired final combined result. concat_dims : sequence of str The dimensions along which the datasets should be concatenated. Must be - in order, and the length must match + in order, and the length must match the length of the tuples used as + keys in combined_ids. If the string is a dimension name then concat + along that dimension, if it is None then merge. Returns ------- combined_ds : xarray.Dataset """ - # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_ids tuples # by one. 
It always combines along the first dimension, removing the first
    # element of the tuple
    for concat_dim in concat_dims:
-        combined_ids = _auto_combine_all_along_first_dim(combined_ids,
-                                                         dim=concat_dim,
-                                                         data_vars=data_vars,
-                                                         coords=coords,
-                                                         compat=compat)
+        combined_ids = _combine_all_along_first_dim(combined_ids,
+                                                    dim=concat_dim,
+                                                    data_vars=data_vars,
+                                                    coords=coords,
+                                                    compat=compat)
     combined_ds = list(combined_ids.values())[0]
     return combined_ds
 
 
-def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars,
-                                      coords, compat):
+def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat):
+
+    # Group into lines of datasets which must be combined along dim
     # need to sort by _new_tile_id first for groupby to work
     # TODO remove all these sorted OrderedDicts once python >= 3.6 only
     combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id))
     grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id)
 
+    # Combine all of these datasets along dim
     new_combined_ids = {}
     for new_id, group in grouped:
         combined_ids = OrderedDict(sorted(group))
         datasets = combined_ids.values()
-        new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat,
-                                                    data_vars, coords)
+        new_combined_ids[new_id] = _combine_1d(datasets, dim, compat,
+                                               data_vars, coords)
     return new_combined_ids
 
 
-def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
-                     compat='no_conflicts',
-                     data_vars='all', coords='different'):
-    # This is just the old auto_combine function (which only worked along 1D)
+def _combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                compat='no_conflicts', data_vars='all', coords='different'):
+    """
+    Applies either concat or merge to 1D list of datasets depending on value
+    of concat_dim
+    """
+
+    # TODO this logic is taken from old 1D auto_combine - check if it's right
+    # Should it just use concat directly instead?
     if concat_dim is not None:
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
-        grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds)))
-        concatenated = [_auto_concat(list(ds_group), dim=dim,
-                                     data_vars=data_vars, coords=coords)
-                        for id, ds_group in grouped]
+        combined = _auto_concat(datasets, dim=dim, data_vars=data_vars,
+                                coords=coords)
     else:
-        concatenated = datasets
-    merged = merge(concatenated, compat=compat)
-    return merged
+        combined = merge(datasets, compat=compat)
+
+    return combined
 
 
 def _new_tile_id(single_id_ds_pair):
 
From 7525b23ee6a9fd81f0682f82b160df0e1b462bff Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 4 Jan 2019 13:02:24 +0000
Subject: [PATCH 40/96] Updated what's new

---
 doc/whats-new.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 2671a3d3a29..4b47657152b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,23 @@ Breaking changes
   Python 3 only. (:issue:`1876`).
   By `Joe Hamman `_.
 
+
+- Combining datasets along N dimensions:
+
+  - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any
+    number of dimensions, instead of just a one-dimensional list of datasets.
+
+    If the datasets have monotonic global dimension coordinates then the new
+    ``auto_combine`` should be used. If not then the new ``manual_combine``
+    will accept the datasets as a nested list-of-lists, and combine by
+    applying a series of concat and merge operations. 
+ + Breaking because some lists that were previously valid inputs to + ``open_mfdataset`` and ``auto_combine`` may no longer be valid, and should + now be combined explicitly using ``manual_combine`` instead. + (:issue:`2159`) By `Tom Nicholas `_. + + Enhancements ~~~~~~~~~~~~ From 92e120abca9e3791f4377f8cd9f29122bb10557e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 4 Jan 2019 13:05:10 +0000 Subject: [PATCH 41/96] Removed uneeded addition to what's new for old release --- doc/whats-new.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b47657152b..c4c5bab589a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -87,18 +87,6 @@ This minor release includes a number of enhancements and bug fixes, and two Breaking changes ~~~~~~~~~~~~~~~~ -- Auto-combine along N dimensions: - - - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any - number of dimensions, instead of just a 1D list of datasets. To combine - along multiple dimensions the datasets must be passed as a nested - list-of-lists. - - Breaking because ``open_mfdataset`` and ``auto_combine`` now expect an - argument ``concat_dims`` instead of ``concat_dim``. ``concat_dims`` accepts - a list of valid ``concat_dim`` arguments, e.g. ``['dim1', 'dim2']``. - (:issue:`2159`) - By `Tom Nicholas `_. - Minimum rasterio version increased from 0.36 to 1.0 (for ``open_rasterio``) - Time bounds variables are now also decoded according to CF conventions (:issue:`2565`). The previous behavior was to decode them only if they From 13a7f75f8e113172cc5b34d385af1f48e9a5bf1e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 4 Jan 2019 13:08:16 +0000 Subject: [PATCH 42/96] Fixed incomplete merge in docstring for open_mfdataset --- xarray/backends/api.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a4da2ae2059..aa10bc61e06 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -493,15 +493,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, combine='auto', autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. -<<<<<<< HEAD If combine='auto' then the function `auto_combine` is used to combine the datasets into one before returning the result, and if combine='manual' then `manual_combine` is used. The filepaths must be structured according to which combining function is used, the details of which are given in the documentation for ``auto_combine`` and ``manual_combine``. - -======= ->>>>>>> real_master Requires dask to be installed. See documentation for details on dask [1]. Attributes from the first dataset file are used for the combined dataset. @@ -526,16 +522,10 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining -<<<<<<< HEAD component files. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional -======= - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. 
- compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional ->>>>>>> real_master String indicating how to compare variables of the same name for potential conflicts when merging: * 'broadcast_equals': all values must be equal when variables are @@ -562,7 +552,6 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, active dask scheduler. data_vars : {'minimal', 'different', 'all' or list of str}, optional These data variables will be concatenated together: -<<<<<<< HEAD * 'minimal': Only data variables in which the dimension already appears are included. * 'different': Data variables which are not equal (ignoring @@ -574,22 +563,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. coords : {'minimal', 'different', 'all' or list of str}, optional -======= - - * 'minimal': Only data variables in which the dimension already - appears are included. - * 'different': Data variables which are not equal (ignoring - attributes) across all datasets are also concatenated (as well as - all for which dimension already appears). Beware: this option may - load the data payload of data variables into memory if they are not - already loaded. - * 'all': All data variables will be concatenated. - * list of str: The listed data variables will be concatenated, in - addition to the 'minimal' data variables. - coords : {'minimal', 'different', 'all' o list of str}, optional ->>>>>>> real_master These coordinate variables will be concatenated together: - * 'minimal': Only coordinates in which the dimension already appears are included. * 'different': Coordinates which are not equal (ignoring attributes) From b76e681b206303a7682b5a6b330ebb69f7749315 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 6 Jan 2019 21:35:28 +0000 Subject: [PATCH 43/96] Tests for manual combine passing --- xarray/core/combine.py | 15 +- xarray/tests/test_combine.py | 361 +++++++++++++++++++---------------- 2 files changed, 209 insertions(+), 167 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 95160db1c8d..5bafed039ff 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -370,6 +370,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) +# TODO make sure this gets changed to match #2648 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' @@ -379,8 +380,12 @@ def _infer_concat_order_from_positions(datasets, concat_dims): tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None: - concat_dims = [concat_dims] * n_dims + + + # TODO concat_dims will never be None, it will be [None] instead + + if concat_dims is _CONCAT_DIM_DEFAULT: + concat_dims = [_CONCAT_DIM_DEFAULT] * n_dims else: if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) @@ -466,8 +471,8 @@ def _infer_concat_order_from_coords(datasets): # TODO check that this is still the correct logic for case of merging but no concatenation if len(datasets) > 1 and not concat_dims: - raise ValueError("Could not find any suitable dimension coordinates to" - " use to order the datasets for concatenation") + raise ValueError("Could not find any dimension coordinates to use to " + "order the datasets for concatenation") combined_ids = OrderedDict(zip(tile_ids, datasets)) @@ 
-712,7 +717,7 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, auto_combine """ - if isinstance(concat_dim, str): + if isinstance(concat_dim, str) or concat_dim is None: concat_dim = [concat_dim] # The IDs argument tells _manual_combine that datasets aren't yet sorted diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index c3f4ec27960..31165e41b36 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -8,12 +8,13 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, auto_combine, concat +from xarray import (DataArray, Dataset, Variable, concat, auto_combine, + manual_combine) from xarray.core.combine import ( - _new_tile_id, _combine_all_along_first_dim, _combine_nd, _combine_1d, + _new_tile_id, _check_shape_tile_ids, + _combine_all_along_first_dim, _combine_nd, _all_arrays_equal, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _infer_concat_order_from_coords, _infer_order_1d, _all_arrays_equal, - _check_shape_tile_ids, auto_combine, manual_combine) + _infer_concat_order_from_coords, _infer_order_1d) from xarray.core.pycompat import OrderedDict, iteritems from . import ( @@ -302,116 +303,6 @@ def test_concat_lazy(self): assert combined.dims == ('z', 'x', 'y') -class TestAutoCombine(object): - @pytest.mark.parametrize("combine", [_auto_combine_1d, auto_combine]) - @requires_dask # only for toolz - def test_auto_combine(self, combine): - objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = combine(objs) - expected = Dataset({'x': [0, 1]}) - assert_identical(expected, actual) - - actual = combine([actual]) - assert_identical(expected, actual) - - objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = combine(objs) - expected = Dataset({'x': [0, 1, 2]}) - assert_identical(expected, actual) - - # ensure auto_combine handles non-sorted variables - objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), - Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] - actual = combine(objs) - expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) - assert_identical(expected, actual) - - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - with raises_regex(ValueError, 'too many .* dimensions'): - combine(objs) - - objs = [Dataset({'x': 0}), Dataset({'x': 1})] - with raises_regex(ValueError, 'cannot infer dimension'): - combine(objs) - - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] - with pytest.raises(KeyError): - combine(objs) - - @requires_dask # only for toolz - def test_auto_combine_previously_failed(self): - # In the above scenario, one file is missing, containing the data for - # one year's data for one variable. - datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), - Dataset({'b': ('x', [0]), 'x': [0]}), - Dataset({'a': ('x', [1]), 'x': [1]})] - expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, - {'x': [0, 1]}) - actual = auto_combine(datasets) - assert_identical(expected, actual) - - # Your data includes "time" and "station" dimensions, and each year's - # data has a different set of stations. 
- datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), - Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] - expected = Dataset({'a': (('t', 'x'), - [[np.nan, 2, 3], [1, 2, np.nan]])}, - {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dims=['t']) - assert_identical(expected, actual) - - @requires_dask # only for toolz - def test_auto_combine_still_fails(self): - # concat can't handle new variables (yet): - # https://github.com/pydata/xarray/issues/508 - datasets = [Dataset({'x': 0}, {'y': 0}), - Dataset({'x': 1}, {'y': 1, 'z': 1})] - with pytest.raises(ValueError): - auto_combine(datasets, 'y') - - @requires_dask # only for toolz - def test_auto_combine_no_concat(self): - objs = [Dataset({'x': 0}), Dataset({'y': 1})] - actual = auto_combine(objs) - expected = Dataset({'x': 0, 'y': 1}) - assert_identical(expected, actual) - - objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] - actual = auto_combine(objs) - expected = Dataset({'x': 0, 'y': 1, 'z': 2}) - assert_identical(expected, actual) - - data = Dataset({'x': 0}) - actual = auto_combine([data, data, data], concat_dims=None) - assert_identical(data, actual) - - tmp1 = Dataset({'x': 0}) - tmp2 = Dataset({'x': np.nan}) - actual = auto_combine([tmp1, tmp2], concat_dims=None) - assert_identical(tmp1, actual) - actual = auto_combine([tmp1, tmp2], concat_dims=[None]) - assert_identical(tmp1, actual) - - # Single object, with a concat_dim explicitly provided - # Test the issue reported in GH #1988 - objs = [Dataset({'x': 0, 'y': 1})] - dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dims=[dim]) - expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, - {'baz': [100]}) - assert_identical(expected, actual) - - # Just making sure that auto_combine is doing what is - # expected for non-scalar values, too. 
- objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] - dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dims=[dim]) - expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), - 'y': (('baz', 'z'), [[1, 2]])}, - {'baz': [100]}) - assert_identical(expected, actual) - - class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data @@ -506,7 +397,7 @@ class TestInferOrder1D(object): def test_arrays(self): npt.assert_equal(_infer_order_1d([3, 1, 2, 7]), np.array([2, 0, 1, 3])) npt.assert_equal(_infer_order_1d([5, 7, 8, 8]), np.array([0, 1, 2, 2])) - npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([0, 0, 1])) + npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([1, 1, 0])) npt.assert_equal(_infer_order_1d([2, 5, 5, 1]), np.array([1, 2, 2, 0])) @pytest.mark.xfail @@ -565,8 +456,7 @@ def test_2d(self): def test_no_dimension_coords(self): ds0 = Dataset({'foo': ('x', [0, 1])}) ds1 = Dataset({'foo': ('x', [2, 3])}) - with raises_regex(ValueError, "Could not find any dimension " - "coordinates"): + with raises_regex(ValueError, "Could not find any dimension"): _infer_concat_order_from_coords([ds1, ds0]) def test_coord_not_monotonic(self): @@ -576,7 +466,7 @@ def test_coord_not_monotonic(self): "monotonically increasing"): _infer_concat_order_from_coords([ds1, ds0]) - # TODO raise this error message + # TODO implement this error message @pytest.mark.xfail def test_check_for_impossible_ordering(self): ds0 = Dataset({'x': [0, 1, 5]}) @@ -632,8 +522,7 @@ def _create_tile_ids(shape): return list(tile_ids) -@requires_dask # only for toolz -class TestCombineND(object): +class TestNewTileIDs(object): @pytest.mark.parametrize("old_id, new_id", [((3, 0, 1), (0, 1)), ((0, 0), (0,)), ((1,), ()), @@ -651,16 +540,17 @@ def test_get_new_tile_ids(self, create_combined_ids): actual_tile_ids = _create_tile_ids(shape) assert expected_tile_ids == actual_tile_ids + +class TestCombineND(object): @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) combined_ids = create_combined_ids(shape) ds = create_test_data - result = _auto_combine_all_along_first_dim(combined_ids, - dim=concat_dim, - data_vars='all', - coords='different', - compat='no_conflicts') + result = _combine_all_along_first_dim(combined_ids, dim=concat_dim, + data_vars='all', + coords='different', + compat='no_conflicts') expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) @@ -668,7 +558,7 @@ def test_concat_once(self, create_combined_ids, concat_dim): def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - result = _auto_combine_all_along_first_dim(combined_ids, + result = _combine_all_along_first_dim(combined_ids, dim='dim1', data_vars='all', coords='different', @@ -715,17 +605,113 @@ def test_check_lengths(self): _check_shape_tile_ids(combined_tile_ids) -@requires_dask # only for toolz -class TestAutoCombineND(object): - def test_single_dataset(self): +class TestManualCombine(object): + def test_manual_concat(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = auto_combine(objs) expected = Dataset({'x': [0, 1]}) + actual = manual_combine(objs, concat_dim='x') + assert_identical(expected, actual) + actual = manual_combine(objs, concat_dim=['x']) assert_identical(expected, actual) - actual = auto_combine(actual) + actual = manual_combine([actual], concat_dim=None) 
assert_identical(expected, actual) + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = manual_combine(objs, concat_dim='x') + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) + + # ensure auto_combine handles non-sorted variables + objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), + Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + actual = manual_combine(objs, concat_dim='a') + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + assert_identical(expected, actual) + + # objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + # with raises_regex(ValueError, 'too many .* dimensions'): + # auto_combine(objs) + + # objs = [Dataset({'x': 0}), Dataset({'x': 1})] + # with raises_regex(ValueError, 'cannot infer dimension'): + # auto_combine(objs) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with pytest.raises(KeyError): + manual_combine(objs, concat_dim='x') + + def test_manual_concat_along_new_dim(self): + objs = [Dataset({'a': ('x', [10]), 'x': [0]}), + Dataset({'a': ('x', [20]), 'x': [0]})] + expected = Dataset({'a': (('t', 'x'), [[10], [20]]), 'x': [0]}) + actual = manual_combine(objs, concat_dim='t') + assert_identical(expected, actual) + + # TODO same thing but with a DataArray as the new dim + + def test_manual_merge(self): + data = Dataset({'x': 0}) + actual = manual_combine([data, data, data], concat_dim=None) + assert_identical(data, actual) + + ds1 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) + ds2 = Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) + expected = Dataset({'a': ('x', [1, 2, 3]), 'x': [0, 1, 2]}) + actual = manual_combine([ds1, ds2], concat_dim=None) + assert_identical(expected, actual) + actual = manual_combine([ds1, ds2], concat_dim=[None]) + assert_identical(expected, actual) + + tmp1 = Dataset({'x': 0}) + tmp2 = Dataset({'x': np.nan}) + actual = manual_combine([tmp1, tmp2], concat_dim=None) + assert_identical(tmp1, actual) + actual = manual_combine([tmp1, tmp2], concat_dim=[None]) + assert_identical(tmp1, actual) + + # Single object, with a concat_dim explicitly provided + # Test the issue reported in GH #1988 + objs = [Dataset({'x': 0, 'y': 1})] + dim = DataArray([100], name='baz', dims='baz') + actual = manual_combine(objs, concat_dim=[dim]) + expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, + {'baz': [100]}) + assert_identical(expected, actual) + + # Just making sure that auto_combine is doing what is + # expected for non-scalar values, too. 
+ objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] + dim = DataArray([100], name='baz', dims='baz') + actual = manual_combine(objs, concat_dim=[dim]) + expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), + 'y': (('baz', 'z'), [[1, 2]])}, + {'baz': [100]}) + assert_identical(expected, actual) + + def test_concat_multiple_dims(self): + objs = [[Dataset({'a': (('x', 'y'), [[0]])}), + Dataset({'a': (('x', 'y'), [[1]])})], + [Dataset({'a': (('x', 'y'), [[2]])}), + Dataset({'a': (('x', 'y'), [[3]])})]] + actual = manual_combine(objs, concat_dim=['x', 'y']) + expected = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}) + assert_identical(expected, actual) + + def test_concat_one_dim_merge_another(self): + data = create_test_data() + data1 = data.copy(deep=True) + data2 = data.copy(deep=True) + + objs = [[data1.var1.isel(dim2=slice(4)), + data2.var1.isel(dim2=slice(4, 9))], + [data1.var2.isel(dim2=slice(4)), + data2.var2.isel(dim2=slice(4, 9))]] + + expected = data[['var1', 'var2']] + actual = manual_combine(objs, concat_dim=[None, 'dim2']) + assert expected.identical(actual) + def test_auto_combine_2d(self): ds = create_test_data @@ -735,25 +721,36 @@ def test_auto_combine_2d(self): expected = concat([partway1, partway2, partway3], dim='dim2') datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] - result = auto_combine(datasets, concat_dims=['dim1', 'dim2']) + result = manual_combine(datasets, concat_dim=['dim1', 'dim2']) assert_equal(result, expected) + def test_manual_combine_missing_data_new_dim(self): + # Your data includes "time" and "station" dimensions, and each year's + # data has a different set of stations. + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + expected = Dataset({'a': (('t', 'x'), + [[np.nan, 2, 3], [1, 2, np.nan]])}, + {'x': [0, 1, 2]}) + actual = manual_combine(datasets, concat_dim='t') + assert_identical(expected, actual) + def test_invalid_hypercube_input(self): ds = create_test_data datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent lengths'): - auto_combine(datasets, concat_dims=['dim1', 'dim2']) + manual_combine(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent depths'): - auto_combine(datasets, concat_dims=['dim1', 'dim2']) + manual_combine(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [ds(3), ds(4)]] with raises_regex(ValueError, 'concat_dims has length'): - auto_combine(datasets, concat_dims=['dim1']) + manual_combine(datasets, concat_dim=['dim1']) def test_merge_one_dim_concat_another(self): objs = [[Dataset({'foo': ('x', [0, 1])}), @@ -763,10 +760,7 @@ def test_merge_one_dim_concat_another(self): expected = Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40])}) - actual = auto_combine(objs, concat_dims=['x', None]) - assert_identical(expected, actual) - - actual = auto_combine(objs) + actual = manual_combine(objs, concat_dim=['x', None]) assert_identical(expected, actual) # Proving it works symmetrically @@ -774,51 +768,94 @@ def test_merge_one_dim_concat_another(self): Dataset({'foo': ('x', [2, 3])})], [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]] - actual = auto_combine(objs, concat_dim=[None, 'x']) + actual = manual_combine(objs, concat_dim=[None, 'x']) assert_identical(expected, actual) def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': 
[0]}), Dataset({'x': [1]})]] - actual = auto_combine(objs, concat_dims=[None, 'x']) + actual = manual_combine(objs, concat_dim=[None, 'x']) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]] - actual = auto_combine(objs, concat_dims=['x', None]) + actual = manual_combine(objs, concat_dim=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})]] - actual = auto_combine(objs, concat_dims=[None, None]) + actual = manual_combine(objs, concat_dim=[None, None]) expected = Dataset({'x': [0]}) assert_identical(expected, actual) - objs = [[Dataset({'x': [0]})]] - actual = auto_combine(objs, concat_dims=None) - expected = Dataset({'x': [0]}) + +class TestAutoCombine(object): + def test_auto_combine(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + actual = auto_combine([actual]) + assert_identical(expected, actual) -class TestAutoCombineUsingCoords(object): - def test_infer_order_from_coords_not_implemented(self): - data = create_test_data() - objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - with pytest.raises(NotImplementedError): - auto_combine(objs, infer_order_from_coords=True) + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) - def test_order_inferred_from_coords(self): - data = create_test_data() - objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - with pytest.raises(NotImplementedError): - _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts', - data_vars='all', coords='different', - infer_order_from_coords=True, ids=True) + # ensure auto_combine handles non-sorted variables + objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), + Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + actual = auto_combine(objs) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + with raises_regex(ValueError, 'too many .* dimensions'): + auto_combine(objs) + + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + with raises_regex(ValueError, 'cannot infer dimension'): + auto_combine(objs) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with pytest.raises(KeyError): + auto_combine(objs) - @pytest.mark.xfail(reason="Not yet implemented") def test_infer_order_from_coords(self): - # Should pass once inferring order from coords is implemented data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = auto_combine(objs) # but with infer_order_from_coords=True + actual = auto_combine(objs) expected = data assert_identical(expected, actual) + + def test_auto_combine_previously_failed(self): + # In the above scenario, one file is missing, containing the data for + # one year's data for one variable. 
+ datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), + Dataset({'b': ('x', [0]), 'x': [0]}), + Dataset({'a': ('x', [1]), 'x': [1]})] + expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, + {'x': [0, 1]}) + actual = auto_combine(datasets) + assert_identical(expected, actual) + + def test_auto_combine_still_fails(self): + # concat can't handle new variables (yet): + # https://github.com/pydata/xarray/issues/508 + datasets = [Dataset({'x': 0}, {'y': 0}), + Dataset({'x': 1}, {'y': 1, 'z': 1})] + with pytest.raises(ValueError): + auto_combine(datasets, 'y') + + def test_auto_combine_no_concat(self): + objs = [Dataset({'x': 0}), Dataset({'y': 1})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1}) + assert_identical(expected, actual) + + objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1, 'z': 2}) + assert_identical(expected, actual) + From c09df8b69fdad94783f842c587117be2dabb1f14 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 6 Jan 2019 22:03:03 +0000 Subject: [PATCH 44/96] Tests for auto_combine now passing --- xarray/core/combine.py | 8 +++----- xarray/tests/test_combine.py | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 5bafed039ff..9310d74aead 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -381,9 +381,6 @@ def _infer_concat_order_from_positions(datasets, concat_dims): tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - - # TODO concat_dims will never be None, it will be [None] instead - if concat_dims is _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT] * n_dims else: @@ -791,8 +788,9 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', # Perform the multidimensional combine on each group of data variables # before merging back together concatenated_grouped_by_data_vars = [] - for var_group in grouped: - combined_ids, concat_dims = _infer_concat_order_from_coords(list(var_group)) + for vars, datasets in grouped: + combined_ids, concat_dims = _infer_concat_order_from_coords( + list(datasets)) # TODO check the shape of the combined ids? 
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 31165e41b36..06a8aec90a6 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -622,13 +622,14 @@ def test_manual_concat(self): expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) - # ensure auto_combine handles non-sorted variables + # ensure manual_combine handles non-sorted variables objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] actual = manual_combine(objs, concat_dim='a') expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) assert_identical(expected, actual) + # TODO check these errors get raised properly # objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] # with raises_regex(ValueError, 'too many .* dimensions'): # auto_combine(objs) @@ -804,18 +805,21 @@ def test_auto_combine(self): assert_identical(expected, actual) # ensure auto_combine handles non-sorted variables - objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), - Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + objs = [Dataset({'x': ('a', [0]), 'y': ('a', [0]), 'a': [0]}), + Dataset({'x': ('a', [1]), 'y': ('a', [1]), 'a': [1]})] actual = auto_combine(objs) - expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 1]}) assert_identical(expected, actual) + # TODO check this is the desired behaviour objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - with raises_regex(ValueError, 'too many .* dimensions'): - auto_combine(objs) + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1], 'y': [0, 1]}) + assert_equal(actual, expected) objs = [Dataset({'x': 0}), Dataset({'x': 1})] - with raises_regex(ValueError, 'cannot infer dimension'): + with raises_regex(ValueError, 'Could not find any dimension ' + 'coordinates'): auto_combine(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] @@ -827,7 +831,7 @@ def test_infer_order_from_coords(self): objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] actual = auto_combine(objs) expected = data - assert_identical(expected, actual) + assert expected.broadcast_equals(actual) def test_auto_combine_previously_failed(self): # In the above scenario, one file is missing, containing the data for From 953d572f3526699b3c2f333964bd5019710d99b2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 6 Jan 2019 22:19:49 +0000 Subject: [PATCH 45/96] xfailed weird behaviour with manual_combine trying to determine concat_dim --- xarray/core/combine.py | 8 +++++--- xarray/tests/test_combine.py | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 9310d74aead..8ef263a98b9 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -714,8 +714,9 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, auto_combine """ - if isinstance(concat_dim, str) or concat_dim is None: - concat_dim = [concat_dim] + if concat_dim is not _CONCAT_DIM_DEFAULT: + if isinstance(concat_dim, str) or concat_dim is None: + concat_dim = [concat_dim] # The IDs argument tells _manual_combine that datasets aren't yet sorted return _manual_combine(datasets, concat_dims=concat_dim, compat=compat, @@ -792,7 +793,8 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', combined_ids, concat_dims = 
_infer_concat_order_from_coords( list(datasets)) - # TODO check the shape of the combined ids? + # TODO checking the shape of the combined ids appropriate here? + _check_shape_tile_ids(combined_ids) # Concatenate along all of concat_dims one by one to create single ds concatenated = _combine_nd(combined_ids, concat_dims=concat_dims, diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 06a8aec90a6..005d996c579 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -629,19 +629,21 @@ def test_manual_concat(self): expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) assert_identical(expected, actual) - # TODO check these errors get raised properly - # objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - # with raises_regex(ValueError, 'too many .* dimensions'): - # auto_combine(objs) - - # objs = [Dataset({'x': 0}), Dataset({'x': 1})] - # with raises_regex(ValueError, 'cannot infer dimension'): - # auto_combine(objs) - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with pytest.raises(KeyError): manual_combine(objs, concat_dim='x') + # TODO weird error from auto_concat on both of these when it tries to infer dimension? + @pytest.mark.xfail + def test_manual_concat_too_many_dims_at_once(self): + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + with raises_regex(ValueError, 'too many .* dimensions'): + manual_combine(objs) + + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + with raises_regex(ValueError, 'cannot infer dimension'): + manual_combine(objs) + def test_manual_concat_along_new_dim(self): objs = [Dataset({'a': ('x', [10]), 'x': [0]}), Dataset({'a': ('x', [20]), 'x': [0]})] @@ -862,4 +864,3 @@ def test_auto_combine_no_concat(self): actual = auto_combine(objs) expected = Dataset({'x': 0, 'y': 1, 'z': 2}) assert_identical(expected, actual) - From b7bf1ad0dc2d3a0817182df583776b265b071826 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 6 Jan 2019 22:26:07 +0000 Subject: [PATCH 46/96] Add auto_combine and manual_combine to API page of docs --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 9a00630f88e..cc4c300c4a3 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -19,6 +19,8 @@ Top-level functions broadcast concat merge + auto_combine + manual_combine where set_options full_like From 855d819fec558c0956d584dd80da1d6bc85cd0ef Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 6 Jan 2019 23:07:33 +0000 Subject: [PATCH 47/96] Tests now passing for open_mfdataset --- xarray/backends/api.py | 10 ++++-- xarray/tests/test_backends.py | 63 ++++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index aa10bc61e06..ba5ea3f0b61 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,8 +10,9 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import (_infer_concat_order_from_positions, - auto_combine, _manual_combine) +from xarray import auto_combine +from ..core.combine import (_manual_combine, + _infer_concat_order_from_positions) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter @@ -613,10 +614,13 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If combine='auto' then this is unnecessary, but quick. 
# If combine='manual' then this creates a flat list which is easier to # iterate over, while saving the originally-supplied structure as "ids" + if combine is 'manual': + if concat_dim is not _CONCAT_DIM_DEFAULT: + if isinstance(concat_dim, str) or concat_dim is None: + concat_dim = [concat_dim] combined_ids_paths, concat_dims = _infer_concat_order_from_positions( paths, concat_dim) ids, paths = ( diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 28d0b331e58..0940c65cb7f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1987,7 +1987,8 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, subds.to_netcdf(tmpfiles[ii], engine=writeengine) # check that calculation on opened datasets works properly - with open_mfdataset(tmpfiles, engine=readengine, parallel=parallel, + with open_mfdataset(tmpfiles, combine='manual', concat_dim='x', + engine=readengine, parallel=parallel, chunks=chunks) as actual: # check that using open_mfdataset returns dask arrays for variables @@ -2163,11 +2164,13 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) - with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x', chunks={'x': 3}) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) with raises_regex(IOError, 'no files to open'): @@ -2192,7 +2195,8 @@ def test_open_mfdataset_2d(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x']) as actual: + combine='manual', + concat_dim=['y', 'x']) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == \ @@ -2200,7 +2204,8 @@ def test_open_mfdataset_2d(self): assert_identical(original, actual) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x'], + combine='manual', + concat_dim=['y', 'x'], chunks={'x': 3, 'y': 2}) as actual: assert actual.foo.variable.data.chunks == \ ((3, 2, 3, 2), (2, 2, 2, 2),) @@ -2214,7 +2219,8 @@ def test_open_mfdataset_pathlib(self): tmp2 = Path(tmp2) original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(original, actual) @requires_pathlib @@ -2238,10 +2244,10 @@ def test_open_mfdataset_2d_pathlib(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x']) as actual: + combine='manual', + concat_dim=['y', 'x']) as actual: assert_identical(original, actual) - @pytest.mark.xfail(reason="Not yet implemented") def test_open_mfdataset_2(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -2249,11 +2255,8 @@ def test_open_mfdataset_2(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with pytest.raises(NotImplementedError): - open_mfdataset([tmp1, tmp2], infer_order_from_coords=True) - - # With infer_order_from_coords=True this should pass in future - with 
open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): @@ -2266,7 +2269,8 @@ def test_attrs_mfdataset(self): ds2.attrs['test2'] = 'bar' ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: # presumes that attributes inherited from # first dataset loaded assert actual.test1 == ds1.test1 @@ -2275,6 +2279,22 @@ def test_attrs_mfdataset(self): 'no attribute'): actual.test2 + def test_open_mfdataset_auto_combine(self): + original = Dataset({'foo': ('x', np.random.randn(10)), + 'x': np.arange(10)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + + with open_mfdataset([tmp1, tmp2]) as actual: + assert_identical(original, actual) + + with raises_regex(ValueError, "Cannot specify dimensions to " + "concatenate along when " + "auto-combining"): + open_mfdataset([tmp1, tmp2], concat_dim='x') + def test_preprocess_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: @@ -2294,7 +2314,8 @@ def test_save_mfdataset_roundtrip(self): with create_tmp_file() as tmp1: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(actual, original) def test_save_mfdataset_invalid(self): @@ -2320,7 +2341,8 @@ def test_save_mfdataset_pathlib_roundtrip(self): tmp1 = Path(tmp1) tmp2 = Path(tmp2) save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(actual, original) def test_open_and_do_math(self): @@ -2337,7 +2359,8 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim=None) as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2364,7 +2387,8 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dims=[dim]) as actual: + with open_mfdataset([tmp], combine='manual', + concat_dim=[dim]) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): @@ -2416,7 +2440,8 @@ def test_save_mfdataset_compute_false_roundtrip(self): engine=self.engine, compute=False) assert isinstance(delayed_obj, Delayed) delayed_obj.compute() - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(actual, original) From bfcb4e3bc2cebed0871e610257c3ffbdc0313e67 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 00:36:12 +0000 Subject: [PATCH 48/96] Completed merge so that #2648 is respected, and added tests. 
Also moved concat to it's own file to avoid a circular dependency --- xarray/__init__.py | 3 +- xarray/backends/api.py | 14 +- xarray/core/combine.py | 365 +-------------------------------- xarray/core/concat.py | 366 ++++++++++++++++++++++++++++++++++ xarray/core/groupby.py | 2 +- xarray/tests/test_backends.py | 3 +- xarray/tests/test_combine.py | 7 +- 7 files changed, 387 insertions(+), 373 deletions(-) create mode 100644 xarray/core/concat.py diff --git a/xarray/__init__.py b/xarray/__init__.py index 07793b6a6fc..15eadd115b7 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -9,7 +9,8 @@ from .core.alignment import align, broadcast, broadcast_arrays from .core.common import full_like, zeros_like, ones_like -from .core.combine import concat, auto_combine, manual_combine +from .core.concat import concat +from .core.combine import auto_combine, manual_combine from .core.computation import apply_ufunc, dot, where from .core.extensions import (register_dataarray_accessor, register_dataset_accessor) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a7c3d75b43f..43f96f41ced 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -8,14 +8,13 @@ import numpy as np -from .. import Dataset, backends, conventions +from .. import Dataset, DataArray, backends, conventions from ..core import indexing -from xarray import auto_combine -from ..core.combine import (_manual_combine, +from .. import auto_combine +from ..core.combine import (_manual_combine, _CONCAT_DIM_DEFAULT, _infer_concat_order_from_positions) from ..core.pycompat import basestring, path_type -from ..core.utils import (close_on_error, is_grib_path, is_remote_uri, - ReprObject) +from ..core.utils import (close_on_error, is_grib_path, is_remote_uri) from .common import ArrayWriter from .locks import _get_scheduler @@ -486,9 +485,6 @@ def close(self): f.close() -_CONCAT_DIM_DEFAULT = ReprObject('') - - def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', @@ -620,7 +616,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # iterate over, while saving the originally-supplied structure as "ids" if combine is 'manual': if concat_dim is not _CONCAT_DIM_DEFAULT: - if isinstance(concat_dim, str) or concat_dim is None: + if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] combined_ids_paths, concat_dims = _infer_concat_order_from_positions( paths, concat_dim) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 6be131a2fbe..7058232e707 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,373 +1,17 @@ from __future__ import absolute_import, division, print_function import itertools -import warnings from collections import Counter import pandas as pd import numpy as np +from .dataarray import DataArray from . import utils -from .alignment import align from .merge import merge -from .pycompat import OrderedDict, basestring, iteritems -from .variable import IndexVariable, Variable, as_variable -from .variable import concat as concat_vars - - -def concat(objs, dim=None, data_vars='all', coords='different', - compat='equals', positions=None, indexers=None, mode=None, - concat_over=None): - """Concatenate xarray objects along a new or existing dimension. - - Parameters - ---------- - objs : sequence of Dataset and DataArray objects - xarray objects to concatenate together. 
Each object is expected to - consist of variables and coordinates with matching shapes except for - along the concatenated dimension. - dim : str or DataArray or pandas.Index - Name of the dimension to concatenate along. This can either be a new - dimension name, in which case it is added along axis=0, or an existing - dimension name, in which case the location of the dimension is - unchanged. If dimension is provided as a DataArray or Index, its name - is used as the dimension to concatenate along and the values are added - as a coordinate. - data_vars : {'minimal', 'different', 'all' or list of str}, optional - These data variables will be concatenated together: - * 'minimal': Only data variables in which the dimension already - appears are included. - * 'different': Data variables which are not equal (ignoring - attributes) across all datasets are also concatenated (as well as - all for which dimension already appears). Beware: this option may - load the data payload of data variables into memory if they are not - already loaded. - * 'all': All data variables will be concatenated. - * list of str: The listed data variables will be concatenated, in - addition to the 'minimal' data variables. - If objects are DataArrays, data_vars must be 'all'. - coords : {'minimal', 'different', 'all' or list of str}, optional - These coordinate variables will be concatenated together: - * 'minimal': Only coordinates in which the dimension already appears - are included. - * 'different': Coordinates which are not equal (ignoring attributes) - across all datasets are also concatenated (as well as all for which - dimension already appears). Beware: this option may load the data - payload of coordinate variables into memory if they are not already - loaded. - * 'all': All coordinate variables will be concatenated, except - those corresponding to other dimensions. - * list of str: The listed coordinate variables will be concatenated, - in addition the 'minimal' coordinates. - compat : {'equals', 'identical'}, optional - String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. 'equals' means - that all variable values and dimensions must be the same; - 'identical' means that variable attributes and global attributes - must also be equal. - positions : None or list of integer arrays, optional - List of integer arrays which specifies the integer positions to which - to assign each dataset along the concatenated dimension. If not - supplied, objects are concatenated in the provided order. 
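# --- Illustrative sketch only, not part of the patch (datasets are hypothetical).
# It shows two of the options documented above: data_vars='minimal' concatenates
# only variables that already contain the dimension, and passing a pandas.Index
# as ``dim`` creates a new dimension whose values become a coordinate.
import numpy as np
import pandas as pd
import xarray as xr

ds1 = xr.Dataset({'foo': ('x', np.arange(5.0)), 'const': 7},
                 coords={'x': np.arange(5)})
ds2 = xr.Dataset({'foo': ('x', np.arange(5.0, 10.0)), 'const': 7},
                 coords={'x': np.arange(5, 10)})

# 'const' has no 'x' dimension, so it is taken unchanged from the first dataset
along_x = xr.concat([ds1, ds2], dim='x', data_vars='minimal')

# A new 'run' dimension is created, with coordinate values [0, 1]
stacked = xr.concat([ds1, ds1], dim=pd.Index([0, 1], name='run'))
# --- end of sketch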
- indexers, mode, concat_over : deprecated - - Returns - ------- - concatenated : type of objs - - See also - -------- - merge - auto_combine - """ - # TODO: add join and ignore_index arguments copied from pandas.concat - # TODO: support concatenating scalar coordinates even if the concatenated - # dimension already exists - from .dataset import Dataset - from .dataarray import DataArray - - try: - first_obj, objs = utils.peek_at(objs) - except StopIteration: - raise ValueError('must supply at least one object to concatenate') - - if dim is None: - warnings.warn('the `dim` argument to `concat` will be required ' - 'in a future version of xarray; for now, setting it to ' - "the old default of 'concat_dim'", - FutureWarning, stacklevel=2) - dim = 'concat_dims' - - if indexers is not None: # pragma: nocover - warnings.warn('indexers has been renamed to positions; the alias ' - 'will be removed in a future version of xarray', - FutureWarning, stacklevel=2) - positions = indexers - - if mode is not None: - raise ValueError('`mode` is no longer a valid argument to ' - 'xarray.concat; it has been split into the ' - '`data_vars` and `coords` arguments') - if concat_over is not None: - raise ValueError('`concat_over` is no longer a valid argument to ' - 'xarray.concat; it has been split into the ' - '`data_vars` and `coords` arguments') - - if isinstance(first_obj, DataArray): - f = _dataarray_concat - elif isinstance(first_obj, Dataset): - f = _dataset_concat - else: - raise TypeError('can only concatenate xarray Dataset and DataArray ' - 'objects, got %s' % type(first_obj)) - return f(objs, dim, data_vars, coords, compat, positions) - - -def _calc_concat_dim_coord(dim): - """ - Infer the dimension name and 1d coordinate variable (if appropriate) - for concatenating along the new dimension. - """ - from .dataarray import DataArray - - if isinstance(dim, basestring): - coord = None - elif not isinstance(dim, (DataArray, Variable)): - dim_name = getattr(dim, 'name', None) - if dim_name is None: - dim_name = 'concat_dim' - coord = IndexVariable(dim_name, dim) - dim = dim_name - elif not isinstance(dim, DataArray): - coord = as_variable(dim).to_index_variable() - dim, = coord.dims - else: - coord = dim - dim, = coord.dims - return dim, coord - - -def _calc_concat_over(datasets, dim, data_vars, coords): - """ - Determine which dataset variables need to be concatenated in the result, - and which can simply be taken from the first dataset. - """ - # Return values - concat_over = set() - equals = {} - - if dim in datasets[0]: - concat_over.add(dim) - for ds in datasets: - concat_over.update(k for k, v in ds.variables.items() - if dim in v.dims) - - def process_subset_opt(opt, subset): - if isinstance(opt, basestring): - if opt == 'different': - # all nonindexes that are not the same in each dataset - for k in getattr(datasets[0], subset): - if k not in concat_over: - # Compare the variable of all datasets vs. the one - # of the first dataset. Perform the minimum amount of - # loads in order to avoid multiple loads from disk - # while keeping the RAM footprint low. - v_lhs = datasets[0].variables[k].load() - # We'll need to know later on if variables are equal. 
- computed = [] - for ds_rhs in datasets[1:]: - v_rhs = ds_rhs.variables[k].compute() - computed.append(v_rhs) - if not v_lhs.equals(v_rhs): - concat_over.add(k) - equals[k] = False - # computed variables are not to be re-computed - # again in the future - for ds, v in zip(datasets[1:], computed): - ds.variables[k].data = v.data - break - else: - equals[k] = True - - elif opt == 'all': - concat_over.update(set(getattr(datasets[0], subset)) - - set(datasets[0].dims)) - elif opt == 'minimal': - pass - else: - raise ValueError("unexpected value for %s: %s" % (subset, opt)) - else: - invalid_vars = [k for k in opt - if k not in getattr(datasets[0], subset)] - if invalid_vars: - if subset == 'coords': - raise ValueError( - 'some variables in coords are not coordinates on ' - 'the first dataset: %s' % (invalid_vars,)) - else: - raise ValueError( - 'some variables in data_vars are not data variables ' - 'on the first dataset: %s' % (invalid_vars,)) - concat_over.update(opt) - - process_subset_opt(data_vars, 'data_vars') - process_subset_opt(coords, 'coords') - return concat_over, equals - - -def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): - """ - Concatenate a sequence of datasets along a new or existing dimension - """ - from .dataset import Dataset - - if compat not in ['equals', 'identical']: - raise ValueError("compat=%r invalid: must be 'equals' " - "or 'identical'" % compat) - - dim, coord = _calc_concat_dim_coord(dim) - # Make sure we're working on a copy (we'll be loading variables) - datasets = [ds.copy() for ds in datasets] - datasets = align(*datasets, join='outer', copy=False, exclude=[dim]) - - concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) - - def insert_result_variable(k, v): - assert isinstance(v, Variable) - if k in datasets[0].coords: - result_coord_names.add(k) - result_vars[k] = v - - # create the new dataset and add constant variables - result_vars = OrderedDict() - result_coord_names = set(datasets[0].coords) - result_attrs = datasets[0].attrs - result_encoding = datasets[0].encoding - - for k, v in datasets[0].variables.items(): - if k not in concat_over: - insert_result_variable(k, v) - - # check that global attributes and non-concatenated variables are fixed - # across all datasets - for ds in datasets[1:]: - if (compat == 'identical' and - not utils.dict_equiv(ds.attrs, result_attrs)): - raise ValueError('dataset global attributes not equal') - for k, v in iteritems(ds.variables): - if k not in result_vars and k not in concat_over: - raise ValueError('encountered unexpected variable %r' % k) - elif (k in result_coord_names) != (k in ds.coords): - raise ValueError('%r is a coordinate in some datasets but not ' - 'others' % k) - elif k in result_vars and k != dim: - # Don't use Variable.identical as it internally invokes - # Variable.equals, and we may already know the answer - if compat == 'identical' and not utils.dict_equiv( - v.attrs, result_vars[k].attrs): - raise ValueError( - 'variable %s not identical across datasets' % k) - - # Proceed with equals() - try: - # May be populated when using the "different" method - is_equal = equals[k] - except KeyError: - result_vars[k].load() - is_equal = v.equals(result_vars[k]) - if not is_equal: - raise ValueError( - 'variable %s not equal across datasets' % k) - - # we've already verified everything is consistent; now, calculate - # shared dimension sizes so we can expand the necessary variables - dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] - non_concat_dims = {} - 
for ds in datasets: - non_concat_dims.update(ds.dims) - non_concat_dims.pop(dim, None) - - def ensure_common_dims(vars): - # ensure each variable with the given name shares the same - # dimensions and the same shape for all of them except along the - # concat dimension - common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) - if dim not in common_dims: - common_dims = (dim,) + common_dims - for var, dim_len in zip(vars, dim_lengths): - if var.dims != common_dims: - common_shape = tuple(non_concat_dims.get(d, dim_len) - for d in common_dims) - var = var.set_dims(common_dims, common_shape) - yield var - - # stack up each variable to fill-out the dataset (in order) - for k in datasets[0].variables: - if k in concat_over: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - combined = concat_vars(vars, dim, positions) - insert_result_variable(k, combined) - - result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) - result.encoding = result_encoding - - if coord is not None: - # add concat dimension last to ensure that its in the final Dataset - result[coord.name] = coord - - return result - - -def _dataarray_concat(arrays, dim, data_vars, coords, compat, - positions): - arrays = list(arrays) - - if data_vars != 'all': - raise ValueError('data_vars is not a valid argument when ' - 'concatenating DataArray objects') - - datasets = [] - for n, arr in enumerate(arrays): - if n == 0: - name = arr.name - elif name != arr.name: - if compat == 'identical': - raise ValueError('array names not identical') - else: - arr = arr.rename(name) - datasets.append(arr._to_temp_dataset()) - - ds = _dataset_concat(datasets, dim, data_vars, coords, compat, - positions) - return arrays[0]._from_temp_dataset(ds, name) - - -def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): - if len(datasets) == 1 and dim is None: - # There is nothing more to combine, so kick out early. 
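# --- Illustrative sketch only, not part of the patch: the broadcasting step that
# ensure_common_dims performs above, using Variable.set_dims to give a variable
# the concat dimension it is missing before the pieces are stacked.
import numpy as np
from xarray import Variable

v = Variable(('y',), np.array([1, 2, 3]))
expanded = v.set_dims(('x', 'y'), (1, 3))  # add 'x' with this dataset's length
assert expanded.dims == ('x', 'y')
assert expanded.shape == (1, 3)
# --- end of sketch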
- return datasets[0] - else: - if dim is None: - ds0 = datasets[0] - ds1 = datasets[1] - concat_dims = set(ds0.dims) - if ds0.dims != ds1.dims: - dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) - concat_dims = set(i for i, _ in dim_tuples) - if len(concat_dims) > 1: - concat_dims = set(d for d in concat_dims - if not ds0[d].equals(ds1[d])) - if len(concat_dims) > 1: - raise ValueError('too many different dimensions to ' - 'concatenate: %s' % concat_dims) - elif len(concat_dims) == 0: - raise ValueError('cannot infer dimension to concatenate: ' - 'supply the ``concat_dim`` argument ' - 'explicitly') - dim, = concat_dims - return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) +from .concat import _auto_concat +from .pycompat import OrderedDict _CONCAT_DIM_DEFAULT = utils.ReprObject('') @@ -383,6 +27,7 @@ def _infer_concat_order_from_positions(datasets, concat_dims): if concat_dims is _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT] * n_dims else: + print(concat_dims) if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) + " but the datasets passed are nested in a " + @@ -714,7 +359,7 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, """ if concat_dim is not _CONCAT_DIM_DEFAULT: - if isinstance(concat_dim, str) or concat_dim is None: + if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] # The IDs argument tells _manual_combine that datasets aren't yet sorted diff --git a/xarray/core/concat.py b/xarray/core/concat.py new file mode 100644 index 00000000000..cfd4b6612e8 --- /dev/null +++ b/xarray/core/concat.py @@ -0,0 +1,366 @@ +from __future__ import absolute_import, division, print_function + +import warnings + +import pandas as pd + + +from . import utils +from .alignment import align +from .pycompat import OrderedDict, basestring, iteritems +from .variable import IndexVariable, Variable, as_variable +from .variable import concat as concat_vars + + +def concat(objs, dim=None, data_vars='all', coords='different', + compat='equals', positions=None, indexers=None, mode=None, + concat_over=None): + """Concatenate xarray objects along a new or existing dimension. + + Parameters + ---------- + objs : sequence of Dataset and DataArray objects + xarray objects to concatenate together. Each object is expected to + consist of variables and coordinates with matching shapes except for + along the concatenated dimension. + dim : str or DataArray or pandas.Index + Name of the dimension to concatenate along. This can either be a new + dimension name, in which case it is added along axis=0, or an existing + dimension name, in which case the location of the dimension is + unchanged. If dimension is provided as a DataArray or Index, its name + is used as the dimension to concatenate along and the values are added + as a coordinate. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + These data variables will be concatenated together: + * 'minimal': Only data variables in which the dimension already + appears are included. + * 'different': Data variables which are not equal (ignoring + attributes) across all datasets are also concatenated (as well as + all for which dimension already appears). Beware: this option may + load the data payload of data variables into memory if they are not + already loaded. + * 'all': All data variables will be concatenated. 
+ * list of str: The listed data variables will be concatenated, in + addition to the 'minimal' data variables. + If objects are DataArrays, data_vars must be 'all'. + coords : {'minimal', 'different', 'all' or list of str}, optional + These coordinate variables will be concatenated together: + * 'minimal': Only coordinates in which the dimension already appears + are included. + * 'different': Coordinates which are not equal (ignoring attributes) + across all datasets are also concatenated (as well as all for which + dimension already appears). Beware: this option may load the data + payload of coordinate variables into memory if they are not already + loaded. + * 'all': All coordinate variables will be concatenated, except + those corresponding to other dimensions. + * list of str: The listed coordinate variables will be concatenated, + in addition the 'minimal' coordinates. + compat : {'equals', 'identical'}, optional + String indicating how to compare non-concatenated variables and + dataset global attributes for potential conflicts. 'equals' means + that all variable values and dimensions must be the same; + 'identical' means that variable attributes and global attributes + must also be equal. + positions : None or list of integer arrays, optional + List of integer arrays which specifies the integer positions to which + to assign each dataset along the concatenated dimension. If not + supplied, objects are concatenated in the provided order. + indexers, mode, concat_over : deprecated + + Returns + ------- + concatenated : type of objs + + See also + -------- + merge + auto_combine + """ + # TODO: add join and ignore_index arguments copied from pandas.concat + # TODO: support concatenating scalar coordinates even if the concatenated + # dimension already exists + from .dataset import Dataset + from .dataarray import DataArray + + try: + first_obj, objs = utils.peek_at(objs) + except StopIteration: + raise ValueError('must supply at least one object to concatenate') + + if dim is None: + warnings.warn('the `dim` argument to `concat` will be required ' + 'in a future version of xarray; for now, setting it to ' + "the old default of 'concat_dim'", + FutureWarning, stacklevel=2) + dim = 'concat_dims' + + if indexers is not None: # pragma: nocover + warnings.warn('indexers has been renamed to positions; the alias ' + 'will be removed in a future version of xarray', + FutureWarning, stacklevel=2) + positions = indexers + + if mode is not None: + raise ValueError('`mode` is no longer a valid argument to ' + 'xarray.concat; it has been split into the ' + '`data_vars` and `coords` arguments') + if concat_over is not None: + raise ValueError('`concat_over` is no longer a valid argument to ' + 'xarray.concat; it has been split into the ' + '`data_vars` and `coords` arguments') + + if isinstance(first_obj, DataArray): + f = _dataarray_concat + elif isinstance(first_obj, Dataset): + f = _dataset_concat + else: + raise TypeError('can only concatenate xarray Dataset and DataArray ' + 'objects, got %s' % type(first_obj)) + return f(objs, dim, data_vars, coords, compat, positions) + + +def _calc_concat_dim_coord(dim): + """ + Infer the dimension name and 1d coordinate variable (if appropriate) + for concatenating along the new dimension. 
+ """ + from .dataarray import DataArray + + if isinstance(dim, basestring): + coord = None + elif not isinstance(dim, (DataArray, Variable)): + dim_name = getattr(dim, 'name', None) + if dim_name is None: + dim_name = 'concat_dim' + coord = IndexVariable(dim_name, dim) + dim = dim_name + elif not isinstance(dim, DataArray): + coord = as_variable(dim).to_index_variable() + dim, = coord.dims + else: + coord = dim + dim, = coord.dims + return dim, coord + + +def _calc_concat_over(datasets, dim, data_vars, coords): + """ + Determine which dataset variables need to be concatenated in the result, + and which can simply be taken from the first dataset. + """ + # Return values + concat_over = set() + equals = {} + + if dim in datasets[0]: + concat_over.add(dim) + for ds in datasets: + concat_over.update(k for k, v in ds.variables.items() + if dim in v.dims) + + def process_subset_opt(opt, subset): + if isinstance(opt, basestring): + if opt == 'different': + # all nonindexes that are not the same in each dataset + for k in getattr(datasets[0], subset): + if k not in concat_over: + # Compare the variable of all datasets vs. the one + # of the first dataset. Perform the minimum amount of + # loads in order to avoid multiple loads from disk + # while keeping the RAM footprint low. + v_lhs = datasets[0].variables[k].load() + # We'll need to know later on if variables are equal. + computed = [] + for ds_rhs in datasets[1:]: + v_rhs = ds_rhs.variables[k].compute() + computed.append(v_rhs) + if not v_lhs.equals(v_rhs): + concat_over.add(k) + equals[k] = False + # computed variables are not to be re-computed + # again in the future + for ds, v in zip(datasets[1:], computed): + ds.variables[k].data = v.data + break + else: + equals[k] = True + + elif opt == 'all': + concat_over.update(set(getattr(datasets[0], subset)) - + set(datasets[0].dims)) + elif opt == 'minimal': + pass + else: + raise ValueError("unexpected value for %s: %s" % (subset, opt)) + else: + invalid_vars = [k for k in opt + if k not in getattr(datasets[0], subset)] + if invalid_vars: + if subset == 'coords': + raise ValueError( + 'some variables in coords are not coordinates on ' + 'the first dataset: %s' % (invalid_vars,)) + else: + raise ValueError( + 'some variables in data_vars are not data variables ' + 'on the first dataset: %s' % (invalid_vars,)) + concat_over.update(opt) + + process_subset_opt(data_vars, 'data_vars') + process_subset_opt(coords, 'coords') + return concat_over, equals + + +def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): + """ + Concatenate a sequence of datasets along a new or existing dimension + """ + from .dataset import Dataset + + if compat not in ['equals', 'identical']: + raise ValueError("compat=%r invalid: must be 'equals' " + "or 'identical'" % compat) + + dim, coord = _calc_concat_dim_coord(dim) + # Make sure we're working on a copy (we'll be loading variables) + datasets = [ds.copy() for ds in datasets] + datasets = align(*datasets, join='outer', copy=False, exclude=[dim]) + + concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) + + def insert_result_variable(k, v): + assert isinstance(v, Variable) + if k in datasets[0].coords: + result_coord_names.add(k) + result_vars[k] = v + + # create the new dataset and add constant variables + result_vars = OrderedDict() + result_coord_names = set(datasets[0].coords) + result_attrs = datasets[0].attrs + result_encoding = datasets[0].encoding + + for k, v in datasets[0].variables.items(): + if k not in concat_over: + 
insert_result_variable(k, v) + + # check that global attributes and non-concatenated variables are fixed + # across all datasets + for ds in datasets[1:]: + if (compat == 'identical' and + not utils.dict_equiv(ds.attrs, result_attrs)): + raise ValueError('dataset global attributes not equal') + for k, v in iteritems(ds.variables): + if k not in result_vars and k not in concat_over: + raise ValueError('encountered unexpected variable %r' % k) + elif (k in result_coord_names) != (k in ds.coords): + raise ValueError('%r is a coordinate in some datasets but not ' + 'others' % k) + elif k in result_vars and k != dim: + # Don't use Variable.identical as it internally invokes + # Variable.equals, and we may already know the answer + if compat == 'identical' and not utils.dict_equiv( + v.attrs, result_vars[k].attrs): + raise ValueError( + 'variable %s not identical across datasets' % k) + + # Proceed with equals() + try: + # May be populated when using the "different" method + is_equal = equals[k] + except KeyError: + result_vars[k].load() + is_equal = v.equals(result_vars[k]) + if not is_equal: + raise ValueError( + 'variable %s not equal across datasets' % k) + + # we've already verified everything is consistent; now, calculate + # shared dimension sizes so we can expand the necessary variables + dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] + non_concat_dims = {} + for ds in datasets: + non_concat_dims.update(ds.dims) + non_concat_dims.pop(dim, None) + + def ensure_common_dims(vars): + # ensure each variable with the given name shares the same + # dimensions and the same shape for all of them except along the + # concat dimension + common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) + if dim not in common_dims: + common_dims = (dim,) + common_dims + for var, dim_len in zip(vars, dim_lengths): + if var.dims != common_dims: + common_shape = tuple(non_concat_dims.get(d, dim_len) + for d in common_dims) + var = var.set_dims(common_dims, common_shape) + yield var + + # stack up each variable to fill-out the dataset (in order) + for k in datasets[0].variables: + if k in concat_over: + vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + combined = concat_vars(vars, dim, positions) + insert_result_variable(k, combined) + + result = Dataset(result_vars, attrs=result_attrs) + result = result.set_coords(result_coord_names) + result.encoding = result_encoding + + if coord is not None: + # add concat dimension last to ensure that its in the final Dataset + result[coord.name] = coord + + return result + + +def _dataarray_concat(arrays, dim, data_vars, coords, compat, + positions): + arrays = list(arrays) + + if data_vars != 'all': + raise ValueError('data_vars is not a valid argument when ' + 'concatenating DataArray objects') + + datasets = [] + for n, arr in enumerate(arrays): + if n == 0: + name = arr.name + elif name != arr.name: + if compat == 'identical': + raise ValueError('array names not identical') + else: + arr = arr.rename(name) + datasets.append(arr._to_temp_dataset()) + + ds = _dataset_concat(datasets, dim, data_vars, coords, compat, + positions) + return arrays[0]._from_temp_dataset(ds, name) + + +def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): + if len(datasets) == 1 and dim is None: + # There is nothing more to combine, so kick out early. 
+ return datasets[0] + else: + if dim is None: + ds0 = datasets[0] + ds1 = datasets[1] + concat_dims = set(ds0.dims) + if ds0.dims != ds1.dims: + dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) + concat_dims = set(i for i, _ in dim_tuples) + if len(concat_dims) > 1: + concat_dims = set(d for d in concat_dims + if not ds0[d].equals(ds1[d])) + if len(concat_dims) > 1: + raise ValueError('too many different dimensions to ' + 'concatenate: %s' % concat_dims) + elif len(concat_dims) == 0: + raise ValueError('cannot infer dimension to concatenate: ' + 'supply the ``concat_dim`` argument ' + 'explicitly') + dim, = concat_dims + return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index ec8329d6805..d3de985a88e 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -9,7 +9,7 @@ from . import dtypes, duck_array_ops, nputils, ops, utils from .arithmetic import SupportsArithmetic -from .combine import concat +from .concat import concat from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce from .options import _get_keep_attrs from .pycompat import integer_types, range, zip diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1c4fc1bdfe7..ddadb0773f1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2411,7 +2411,8 @@ def test_open_multi_dataset(self): create_tmp_file() as tmp2: original.to_netcdf(tmp1) original.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=dim) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim=dim) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 005d996c579..afcdd4eb8f7 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -651,7 +651,12 @@ def test_manual_concat_along_new_dim(self): actual = manual_combine(objs, concat_dim='t') assert_identical(expected, actual) - # TODO same thing but with a DataArray as the new dim + # Same but with a DataArray as new dim, see GH #1988 and #2647 + dim = DataArray([100, 150], name='baz', dims='baz') + expected = Dataset({'a': (('baz', 'x'), [[10], [20]]), + 'x': [0], 'baz': [100, 150]}) + actual = manual_combine(objs, concat_dim=dim) + assert_identical(expected, actual) def test_manual_merge(self): data = Dataset({'x': 0}) From eb053cc598f0f172260de0d1682b2afa3e89469d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 00:54:06 +0000 Subject: [PATCH 49/96] Separated the tests for concat and both combines --- xarray/tests/test_combine.py | 290 +--------------------------------- xarray/tests/test_concat.py | 295 +++++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+), 286 deletions(-) create mode 100644 xarray/tests/test_concat.py diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index afcdd4eb8f7..46b9a02431a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -8,301 +8,19 @@ import pandas as pd import pytest -from xarray import (DataArray, Dataset, Variable, concat, auto_combine, - manual_combine) +from xarray import DataArray, Dataset, concat, auto_combine, manual_combine from xarray.core.combine import ( _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, _all_arrays_equal, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, _infer_concat_order_from_coords, 
_infer_order_1d) -from xarray.core.pycompat import OrderedDict, iteritems +from xarray.core.pycompat import OrderedDict -from . import ( - InaccessibleArray, assert_array_equal, assert_combined_tile_ids_equal, - assert_equal, assert_identical, raises_regex, requires_dask) +from . import (assert_combined_tile_ids_equal, assert_identical, assert_equal, + raises_regex) from .test_dataset import create_test_data -class TestConcatDataset(object): - def test_concat(self): - # TODO: simplify and split this test case - - # drop the third dimension to keep things relatively understandable - data = create_test_data() - for k in list(data.variables): - if 'dim3' in data[k].dims: - del data[k] - - split_data = [data.isel(dim1=slice(3)), - data.isel(dim1=slice(3, None))] - assert_identical(data, concat(split_data, 'dim1')) - - def rectify_dim_order(dataset): - # return a new dataset with all variable dimensions transposed into - # the order in which they are found in `data` - return Dataset(dict((k, v.transpose(*data[k].dims)) - for k, v in iteritems(dataset.data_vars)), - dataset.coords, attrs=dataset.attrs) - - for dim in ['dim1', 'dim2']: - datasets = [g for _, g in data.groupby(dim, squeeze=False)] - assert_identical(data, concat(datasets, dim)) - - dim = 'dim2' - assert_identical( - data, concat(datasets, data[dim])) - assert_identical( - data, concat(datasets, data[dim], coords='minimal')) - - datasets = [g for _, g in data.groupby(dim, squeeze=True)] - concat_over = [k for k, v in iteritems(data.coords) - if dim in v.dims and k != dim] - actual = concat(datasets, data[dim], coords=concat_over) - assert_identical(data, rectify_dim_order(actual)) - - actual = concat(datasets, data[dim], coords='different') - assert_identical(data, rectify_dim_order(actual)) - - # make sure the coords argument behaves as expected - data.coords['extra'] = ('dim4', np.arange(3)) - for dim in ['dim1', 'dim2']: - datasets = [g for _, g in data.groupby(dim, squeeze=True)] - actual = concat(datasets, data[dim], coords='all') - expected = np.array([data['extra'].values - for _ in range(data.dims[dim])]) - assert_array_equal(actual['extra'].values, expected) - - actual = concat(datasets, data[dim], coords='different') - assert_equal(data['extra'], actual['extra']) - actual = concat(datasets, data[dim], coords='minimal') - assert_equal(data['extra'], actual['extra']) - - # verify that the dim argument takes precedence over - # concatenating dataset variables of the same name - dim = (2 * data['dim1']).rename('dim1') - datasets = [g for _, g in data.groupby('dim1', squeeze=False)] - expected = data.copy() - expected['dim1'] = dim - assert_identical(expected, concat(datasets, dim)) - - def test_concat_data_vars(self): - data = Dataset({'foo': ('x', np.random.randn(10))}) - objs = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] - for data_vars in ['minimal', 'different', 'all', [], ['foo']]: - actual = concat(objs, dim='x', data_vars=data_vars) - assert_identical(data, actual) - - def test_concat_coords(self): - data = Dataset({'foo': ('x', np.random.randn(10))}) - expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5)) - objs = [data.isel(x=slice(5)).assign_coords(c=0), - data.isel(x=slice(5, None)).assign_coords(c=1)] - for coords in ['different', 'all', ['c']]: - actual = concat(objs, dim='x', coords=coords) - assert_identical(expected, actual) - for coords in ['minimal', []]: - with raises_regex(ValueError, 'not equal across'): - concat(objs, dim='x', coords=coords) - - def test_concat_constant_index(self): - # 
GH425 - ds1 = Dataset({'foo': 1.5}, {'y': 1}) - ds2 = Dataset({'foo': 2.5}, {'y': 1}) - expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]}) - for mode in ['different', 'all', ['foo']]: - actual = concat([ds1, ds2], 'y', data_vars=mode) - assert_identical(expected, actual) - with raises_regex(ValueError, 'not equal across datasets'): - concat([ds1, ds2], 'y', data_vars='minimal') - - def test_concat_size0(self): - data = create_test_data() - split_data = [data.isel(dim1=slice(0, 0)), data] - actual = concat(split_data, 'dim1') - assert_identical(data, actual) - - actual = concat(split_data[::-1], 'dim1') - assert_identical(data, actual) - - def test_concat_autoalign(self): - ds1 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 2])])}) - ds2 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 3])])}) - actual = concat([ds1, ds2], 'y') - expected = Dataset({'foo': DataArray([[1, 2, np.nan], [1, np.nan, 2]], - dims=['y', 'x'], - coords={'x': [1, 2, 3]})}) - assert_identical(expected, actual) - - def test_concat_errors(self): - data = create_test_data() - split_data = [data.isel(dim1=slice(3)), - data.isel(dim1=slice(3, None))] - - with raises_regex(ValueError, 'must supply at least one'): - concat([], 'dim1') - - with raises_regex(ValueError, 'are not coordinates'): - concat([data, data], 'new_dim', coords=['not_found']) - - with raises_regex(ValueError, 'global attributes not'): - data0, data1 = deepcopy(split_data) - data1.attrs['foo'] = 'bar' - concat([data0, data1], 'dim1', compat='identical') - assert_identical( - data, concat([data0, data1], 'dim1', compat='equals')) - - with raises_regex(ValueError, 'encountered unexpected'): - data0, data1 = deepcopy(split_data) - data1['foo'] = ('bar', np.random.randn(10)) - concat([data0, data1], 'dim1') - - with raises_regex(ValueError, 'compat.* invalid'): - concat(split_data, 'dim1', compat='foobar') - - with raises_regex(ValueError, 'unexpected value for'): - concat([data, data], 'new_dim', coords='foobar') - - with raises_regex( - ValueError, 'coordinate in some datasets but not others'): - concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z') - - with raises_regex( - ValueError, 'coordinate in some datasets but not others'): - concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z') - - with raises_regex(ValueError, 'no longer a valid'): - concat([data, data], 'new_dim', mode='different') - with raises_regex(ValueError, 'no longer a valid'): - concat([data, data], 'new_dim', concat_over='different') - - def test_concat_promote_shape(self): - # mixed dims within variables - objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1]}) - assert_identical(actual, expected) - - objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})] - actual = concat(objs, 'x') - assert_identical(actual, expected) - - # mixed dims between variables - objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})] - actual = concat(objs, 'x') - expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])}) - assert_identical(actual, expected) - - # mixed dims in coord variable - objs = [Dataset({'x': [0]}, {'y': -1}), - Dataset({'x': [1]}, {'y': ('x', [-2])})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])}) - assert_identical(actual, expected) - - # scalars with mixed lengths along concat dim -- values should repeat - objs = [Dataset({'x': [0]}, {'y': -1}), - Dataset({'x': [1, 2]}, {'y': -2})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1, 2]}, 
{'y': ('x', [-1, -2, -2])}) - assert_identical(actual, expected) - - # broadcast 1d x 1d -> 2d - objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}), - Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})] - actual = concat(objs, 'x') - expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}, - {'x': [0, 1], 'y': [0]}) - assert_identical(actual, expected) - - def test_concat_do_not_promote(self): - # GH438 - objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), - Dataset({'y': ('t', [2])}, {'x': 1, 't': [0]})] - expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]}) - actual = concat(objs, 't') - assert_identical(expected, actual) - - objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), - Dataset({'y': ('t', [2])}, {'x': 2, 't': [0]})] - with pytest.raises(ValueError): - concat(objs, 't', coords='minimal') - - def test_concat_dim_is_variable(self): - objs = [Dataset({'x': 0}), Dataset({'x': 1})] - coord = Variable('y', [3, 4]) - expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]}) - actual = concat(objs, coord) - assert_identical(actual, expected) - - def test_concat_multiindex(self): - x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']]) - expected = Dataset({'x': x}) - actual = concat([expected.isel(x=slice(2)), - expected.isel(x=slice(2, None))], 'x') - assert expected.equals(actual) - assert isinstance(actual.x.to_index(), pd.MultiIndex) - - -class TestConcatDataArray(object): - def test_concat(self): - ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), - 'bar': (['x', 'y'], np.random.random((2, 3)))}, - {'x': [0, 1]}) - foo = ds['foo'] - bar = ds['bar'] - - # from dataset array: - expected = DataArray(np.array([foo.values, bar.values]), - dims=['w', 'x', 'y'], coords={'x': [0, 1]}) - actual = concat([foo, bar], 'w') - assert_equal(expected, actual) - # from iteration: - grouped = [g for _, g in foo.groupby('x')] - stacked = concat(grouped, ds['x']) - assert_identical(foo, stacked) - # with an index as the 'dim' argument - stacked = concat(grouped, ds.indexes['x']) - assert_identical(foo, stacked) - - actual = concat([foo[0], foo[1]], pd.Index([0, 1]) - ).reset_coords(drop=True) - expected = foo[:2].rename({'x': 'concat_dim'}) - assert_identical(expected, actual) - - actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) - expected = foo[:2].rename({'x': 'concat_dim'}) - assert_identical(expected, actual) - - with raises_regex(ValueError, 'not identical'): - concat([foo, bar], dim='w', compat='identical') - - with raises_regex(ValueError, 'not a valid argument'): - concat([foo, bar], dim='w', data_vars='minimal') - - def test_concat_encoding(self): - # Regression test for GH1297 - ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), - 'bar': (['x', 'y'], np.random.random((2, 3)))}, - {'x': [0, 1]}) - foo = ds['foo'] - foo.encoding = {"complevel": 5} - ds.encoding = {"unlimited_dims": 'x'} - assert concat([foo, foo], dim="x").encoding == foo.encoding - assert concat([ds, ds], dim="x").encoding == ds.encoding - - @requires_dask - def test_concat_lazy(self): - import dask.array as da - - arrays = [DataArray( - da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), - dims=['x', 'y']) for _ in range(2)] - # should not raise - combined = concat(arrays, dim='z') - assert combined.shape == (2, 3, 3) - assert combined.dims == ('z', 'x', 'y') - - class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py new file mode 100644 index 00000000000..d2b80472a76 --- 
/dev/null +++ b/xarray/tests/test_concat.py @@ -0,0 +1,295 @@ +from __future__ import absolute_import, division, print_function + +from copy import deepcopy + +import numpy as np +import pandas as pd +import pytest + +from xarray import DataArray, Dataset, Variable, concat +from xarray.core.pycompat import iteritems + +from . import ( + InaccessibleArray, assert_array_equal, + assert_equal, assert_identical, raises_regex, requires_dask) +from .test_dataset import create_test_data + + +class TestConcatDataset(object): + def test_concat(self): + # TODO: simplify and split this test case + + # drop the third dimension to keep things relatively understandable + data = create_test_data() + for k in list(data.variables): + if 'dim3' in data[k].dims: + del data[k] + + split_data = [data.isel(dim1=slice(3)), + data.isel(dim1=slice(3, None))] + assert_identical(data, concat(split_data, 'dim1')) + + def rectify_dim_order(dataset): + # return a new dataset with all variable dimensions transposed into + # the order in which they are found in `data` + return Dataset(dict((k, v.transpose(*data[k].dims)) + for k, v in iteritems(dataset.data_vars)), + dataset.coords, attrs=dataset.attrs) + + for dim in ['dim1', 'dim2']: + datasets = [g for _, g in data.groupby(dim, squeeze=False)] + assert_identical(data, concat(datasets, dim)) + + dim = 'dim2' + assert_identical( + data, concat(datasets, data[dim])) + assert_identical( + data, concat(datasets, data[dim], coords='minimal')) + + datasets = [g for _, g in data.groupby(dim, squeeze=True)] + concat_over = [k for k, v in iteritems(data.coords) + if dim in v.dims and k != dim] + actual = concat(datasets, data[dim], coords=concat_over) + assert_identical(data, rectify_dim_order(actual)) + + actual = concat(datasets, data[dim], coords='different') + assert_identical(data, rectify_dim_order(actual)) + + # make sure the coords argument behaves as expected + data.coords['extra'] = ('dim4', np.arange(3)) + for dim in ['dim1', 'dim2']: + datasets = [g for _, g in data.groupby(dim, squeeze=True)] + actual = concat(datasets, data[dim], coords='all') + expected = np.array([data['extra'].values + for _ in range(data.dims[dim])]) + assert_array_equal(actual['extra'].values, expected) + + actual = concat(datasets, data[dim], coords='different') + assert_equal(data['extra'], actual['extra']) + actual = concat(datasets, data[dim], coords='minimal') + assert_equal(data['extra'], actual['extra']) + + # verify that the dim argument takes precedence over + # concatenating dataset variables of the same name + dim = (2 * data['dim1']).rename('dim1') + datasets = [g for _, g in data.groupby('dim1', squeeze=False)] + expected = data.copy() + expected['dim1'] = dim + assert_identical(expected, concat(datasets, dim)) + + def test_concat_data_vars(self): + data = Dataset({'foo': ('x', np.random.randn(10))}) + objs = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] + for data_vars in ['minimal', 'different', 'all', [], ['foo']]: + actual = concat(objs, dim='x', data_vars=data_vars) + assert_identical(data, actual) + + def test_concat_coords(self): + data = Dataset({'foo': ('x', np.random.randn(10))}) + expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5)) + objs = [data.isel(x=slice(5)).assign_coords(c=0), + data.isel(x=slice(5, None)).assign_coords(c=1)] + for coords in ['different', 'all', ['c']]: + actual = concat(objs, dim='x', coords=coords) + assert_identical(expected, actual) + for coords in ['minimal', []]: + with raises_regex(ValueError, 'not equal across'): + 
concat(objs, dim='x', coords=coords) + + def test_concat_constant_index(self): + # GH425 + ds1 = Dataset({'foo': 1.5}, {'y': 1}) + ds2 = Dataset({'foo': 2.5}, {'y': 1}) + expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]}) + for mode in ['different', 'all', ['foo']]: + actual = concat([ds1, ds2], 'y', data_vars=mode) + assert_identical(expected, actual) + with raises_regex(ValueError, 'not equal across datasets'): + concat([ds1, ds2], 'y', data_vars='minimal') + + def test_concat_size0(self): + data = create_test_data() + split_data = [data.isel(dim1=slice(0, 0)), data] + actual = concat(split_data, 'dim1') + assert_identical(data, actual) + + actual = concat(split_data[::-1], 'dim1') + assert_identical(data, actual) + + def test_concat_autoalign(self): + ds1 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 2])])}) + ds2 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 3])])}) + actual = concat([ds1, ds2], 'y') + expected = Dataset({'foo': DataArray([[1, 2, np.nan], [1, np.nan, 2]], + dims=['y', 'x'], + coords={'x': [1, 2, 3]})}) + assert_identical(expected, actual) + + def test_concat_errors(self): + data = create_test_data() + split_data = [data.isel(dim1=slice(3)), + data.isel(dim1=slice(3, None))] + + with raises_regex(ValueError, 'must supply at least one'): + concat([], 'dim1') + + with raises_regex(ValueError, 'are not coordinates'): + concat([data, data], 'new_dim', coords=['not_found']) + + with raises_regex(ValueError, 'global attributes not'): + data0, data1 = deepcopy(split_data) + data1.attrs['foo'] = 'bar' + concat([data0, data1], 'dim1', compat='identical') + assert_identical( + data, concat([data0, data1], 'dim1', compat='equals')) + + with raises_regex(ValueError, 'encountered unexpected'): + data0, data1 = deepcopy(split_data) + data1['foo'] = ('bar', np.random.randn(10)) + concat([data0, data1], 'dim1') + + with raises_regex(ValueError, 'compat.* invalid'): + concat(split_data, 'dim1', compat='foobar') + + with raises_regex(ValueError, 'unexpected value for'): + concat([data, data], 'new_dim', coords='foobar') + + with raises_regex( + ValueError, 'coordinate in some datasets but not others'): + concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z') + + with raises_regex( + ValueError, 'coordinate in some datasets but not others'): + concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z') + + with raises_regex(ValueError, 'no longer a valid'): + concat([data, data], 'new_dim', mode='different') + with raises_regex(ValueError, 'no longer a valid'): + concat([data, data], 'new_dim', concat_over='different') + + def test_concat_promote_shape(self): + # mixed dims within variables + objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1]}) + assert_identical(actual, expected) + + objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})] + actual = concat(objs, 'x') + assert_identical(actual, expected) + + # mixed dims between variables + objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})] + actual = concat(objs, 'x') + expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])}) + assert_identical(actual, expected) + + # mixed dims in coord variable + objs = [Dataset({'x': [0]}, {'y': -1}), + Dataset({'x': [1]}, {'y': ('x', [-2])})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])}) + assert_identical(actual, expected) + + # scalars with mixed lengths along concat dim -- values should repeat + objs = [Dataset({'x': [0]}, {'y': -1}), + Dataset({'x': [1, 
2]}, {'y': -2})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1, 2]}, {'y': ('x', [-1, -2, -2])}) + assert_identical(actual, expected) + + # broadcast 1d x 1d -> 2d + objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}), + Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})] + actual = concat(objs, 'x') + expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}, + {'x': [0, 1], 'y': [0]}) + assert_identical(actual, expected) + + def test_concat_do_not_promote(self): + # GH438 + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 1, 't': [0]})] + expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]}) + actual = concat(objs, 't') + assert_identical(expected, actual) + + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 2, 't': [0]})] + with pytest.raises(ValueError): + concat(objs, 't', coords='minimal') + + def test_concat_dim_is_variable(self): + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + coord = Variable('y', [3, 4]) + expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]}) + actual = concat(objs, coord) + assert_identical(actual, expected) + + def test_concat_multiindex(self): + x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']]) + expected = Dataset({'x': x}) + actual = concat([expected.isel(x=slice(2)), + expected.isel(x=slice(2, None))], 'x') + assert expected.equals(actual) + assert isinstance(actual.x.to_index(), pd.MultiIndex) + + +class TestConcatDataArray(object): + def test_concat(self): + ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), + 'bar': (['x', 'y'], np.random.random((2, 3)))}, + {'x': [0, 1]}) + foo = ds['foo'] + bar = ds['bar'] + + # from dataset array: + expected = DataArray(np.array([foo.values, bar.values]), + dims=['w', 'x', 'y'], coords={'x': [0, 1]}) + actual = concat([foo, bar], 'w') + assert_equal(expected, actual) + # from iteration: + grouped = [g for _, g in foo.groupby('x')] + stacked = concat(grouped, ds['x']) + assert_identical(foo, stacked) + # with an index as the 'dim' argument + stacked = concat(grouped, ds.indexes['x']) + assert_identical(foo, stacked) + + actual = concat([foo[0], foo[1]], pd.Index([0, 1]) + ).reset_coords(drop=True) + expected = foo[:2].rename({'x': 'concat_dim'}) + assert_identical(expected, actual) + + actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) + expected = foo[:2].rename({'x': 'concat_dim'}) + assert_identical(expected, actual) + + with raises_regex(ValueError, 'not identical'): + concat([foo, bar], dim='w', compat='identical') + + with raises_regex(ValueError, 'not a valid argument'): + concat([foo, bar], dim='w', data_vars='minimal') + + def test_concat_encoding(self): + # Regression test for GH1297 + ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), + 'bar': (['x', 'y'], np.random.random((2, 3)))}, + {'x': [0, 1]}) + foo = ds['foo'] + foo.encoding = {"complevel": 5} + ds.encoding = {"unlimited_dims": 'x'} + assert concat([foo, foo], dim="x").encoding == foo.encoding + assert concat([ds, ds], dim="x").encoding == ds.encoding + + @requires_dask + def test_concat_lazy(self): + import dask.array as da + + arrays = [DataArray( + da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), + dims=['x', 'y']) for _ in range(2)] + # should not raise + combined = concat(arrays, dim='z') + assert combined.shape == (2, 3, 3) + assert combined.dims == ('z', 'x', 'y') From 97e508c5db7eb0ef11727f6714221a13a56a6bc9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 01:00:23 
+0000 Subject: [PATCH 50/96] Some PEP8 fixes --- xarray/core/combine.py | 18 ++++++++++-------- xarray/tests/test_combine.py | 25 ++++++++++++------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 7058232e707..f10676c4474 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -89,7 +89,8 @@ def _infer_concat_order_from_coords(datasets): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) - # TODO generalise this to deduce whether coord should be monotonically increasing or decreasing + # TODO generalise this to deduce whether coord should be + # monotonically increasing or decreasing if not all(pd.Index(coord).is_monotonic_increasing for coord in coord_vals): raise ValueError(f"Coordinate variable {dim} is not " @@ -97,8 +98,8 @@ def _infer_concat_order_from_coords(datasets): "datasets") # Sort datasets along dim - # Assume that any two datasets whose coord along dim starts with - # the same value have the exact same coord values throughout. + # Assume that any two datasets whose coord along dim starts + # with the same value have the same coord values throughout. first_coord_vals = [coord[0] for coord in coord_vals] new_positions = _infer_order_1d(first_coord_vals, method='dense') @@ -110,7 +111,8 @@ def _infer_concat_order_from_coords(datasets): tile_ids = [tile_id + (position,) for tile_id, position in zip(tile_ids, new_positions)] - # TODO check that this is still the correct logic for case of merging but no concatenation + # TODO check that this is still the correct logic for case of merging but + # no concatenation if len(datasets) > 1 and not concat_dims: raise ValueError("Could not find any dimension coordinates to use to " "order the datasets for concatenation") @@ -122,11 +124,11 @@ def _infer_concat_order_from_coords(datasets): def _all_arrays_equal(iterator): try: - iterator = iter(iterator) - first = next(iterator) - return all(np.array_equal(first, rest) for rest in iterator) + iterator = iter(iterator) + first = next(iterator) + return all(np.array_equal(first, rest) for rest in iterator) except StopIteration: - return True + return True def _infer_order_1d(arr, method='dense'): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 46b9a02431a..13beb3d695f 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,11 +1,9 @@ from __future__ import absolute_import, division, print_function -from copy import deepcopy from itertools import product import numpy as np import numpy.testing as npt -import pandas as pd import pytest from xarray import DataArray, Dataset, concat, auto_combine, manual_combine @@ -137,9 +135,9 @@ def test_datetimes(self): def test_all_arrays_equal(): - assert _all_arrays_equal([np.array([1,2,3]), - np.array([1,2,3]), - np.array([1,2,3])]) + assert _all_arrays_equal([np.array([1, 2, 3]), + np.array([1, 2, 3]), + np.array([1, 2, 3])]) assert not _all_arrays_equal([np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2, 4])]) @@ -276,11 +274,10 @@ def test_concat_once(self, create_combined_ids, concat_dim): def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - result = _combine_all_along_first_dim(combined_ids, - dim='dim1', - data_vars='all', - coords='different', - compat='no_conflicts') + result = _combine_all_along_first_dim(combined_ids, dim='dim1', + data_vars='all', + coords='different', + compat='no_conflicts') ds 
= create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') @@ -351,12 +348,13 @@ def test_manual_concat(self): with pytest.raises(KeyError): manual_combine(objs, concat_dim='x') - # TODO weird error from auto_concat on both of these when it tries to infer dimension? + # TODO weird error from auto_concat on both of these when it tries to infer + # dimension? @pytest.mark.xfail def test_manual_concat_too_many_dims_at_once(self): objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] with raises_regex(ValueError, 'too many .* dimensions'): - manual_combine(objs) + manual_combine(objs) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'cannot infer dimension'): @@ -533,7 +531,8 @@ def test_auto_combine(self): objs = [Dataset({'x': ('a', [0]), 'y': ('a', [0]), 'a': [0]}), Dataset({'x': ('a', [1]), 'y': ('a', [1]), 'a': [1]})] actual = auto_combine(objs) - expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 1]}) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), + 'a': [0, 1]}) assert_identical(expected, actual) # TODO check this is the desired behaviour From 410b138db5ea0343e58b0b33adeb8cb857f514c5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 01:10:37 +0000 Subject: [PATCH 51/96] Pre-empting a test which will fail with opening uamiv format --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ddadb0773f1..2aab0be9a35 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2777,7 +2777,8 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dims=['TSTEP'], + combine='manual', + concat_dim=['TSTEP'], backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5) From 02b6d05eb6b62a4627418563b8253ee0d2a935c8 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 01:12:31 +0000 Subject: [PATCH 52/96] Satisfy pep8speaks bot --- xarray/tests/test_combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 13beb3d695f..f331ea73c3a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -113,7 +113,7 @@ class TestInferOrder1D(object): def test_arrays(self): npt.assert_equal(_infer_order_1d([3, 1, 2, 7]), np.array([2, 0, 1, 3])) npt.assert_equal(_infer_order_1d([5, 7, 8, 8]), np.array([0, 1, 2, 2])) - npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([1, 1, 0])) + npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([1, 1, 0])) npt.assert_equal(_infer_order_1d([2, 5, 5, 1]), np.array([1, 2, 2, 0])) @pytest.mark.xfail From 0d6f13a5a6b2d217717b2e599b496b744fefda88 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 10:48:19 +0000 Subject: [PATCH 53/96] Python 3.5 compatibile after changing some error string formatting --- xarray/core/combine.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index f10676c4474..c9db2750653 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -27,11 +27,10 @@ def _infer_concat_order_from_positions(datasets, concat_dims): if concat_dims is _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT] * n_dims else: - print(concat_dims) if len(concat_dims) != n_dims: - raise ValueError("concat_dims has length " + str(len(concat_dims)) 
- + " but the datasets passed are nested in a " + - str(n_dims) + "-dimensional structure") + raise ValueError("concat_dims has length {} but the datasets " + "passed are nested in a {}-dimensional structure" + .format(str(len(concat_dims)), str(n_dims))) return combined_ids, concat_dims @@ -93,9 +92,9 @@ def _infer_concat_order_from_coords(datasets): # monotonically increasing or decreasing if not all(pd.Index(coord).is_monotonic_increasing for coord in coord_vals): - raise ValueError(f"Coordinate variable {dim} is not " + raise ValueError("Coordinate variable {} is not " "monotonically increasing on all " - "datasets") + "datasets".format(dim)) # Sort datasets along dim # Assume that any two datasets whose coord along dim starts From 18e00746100b1f51cc76205c5b9f732a2ccbc384 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 7 Jan 2019 11:57:48 +0000 Subject: [PATCH 54/96] Order coords using pandas.Index objects --- xarray/core/combine.py | 56 +++++++++++--------------- xarray/tests/test_combine.py | 77 +++++++++++++++--------------------- 2 files changed, 54 insertions(+), 79 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c9db2750653..9021d7dc9ef 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -5,8 +5,6 @@ import pandas as pd -import numpy as np - from .dataarray import DataArray from . import utils from .merge import merge @@ -79,39 +77,50 @@ def _infer_concat_order_from_coords(datasets): if dim in ds0: # Need to read coordinate values to do ordering - coord_vals = [ds[dim].values for ds in datasets] + indexes = [ds.indexes.get(dim) for ds in datasets] + if any(index is None for index in indexes): + raise ValueError("Every dimension needs a coordinate for " + "inferring concatenation order") # If dimension coordinate values are same on every dataset then # should be leaving this dimension alone (it's just a "bystander") - if not _all_arrays_equal(coord_vals): + if not all(index.equals(indexes[0]) for index in indexes[1:]): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) # TODO generalise this to deduce whether coord should be # monotonically increasing or decreasing - if not all(pd.Index(coord).is_monotonic_increasing - for coord in coord_vals): + if not all(index.is_monotonic_increasing for index in indexes): raise ValueError("Coordinate variable {} is not " "monotonically increasing on all " "datasets".format(dim)) - # Sort datasets along dim # Assume that any two datasets whose coord along dim starts # with the same value have the same coord values throughout. - first_coord_vals = [coord[0] for coord in coord_vals] - new_positions = _infer_order_1d(first_coord_vals, - method='dense') + try: + first_items = pd.Index([index.take([0]) + for index in indexes]) + except IndexError: + raise ValueError('Cannot handle size zero dimensions') + + # TODO This seems to work for strings and datetime objects too + # but is that guaranteed pandas behaviour? 
+ + # Sort datasets along dim + # We want rank but with identical elements given identical + # position indices - they should be concatenated along another + # dimension, not along this one + order = first_items.to_series().rank(method='dense').astype( + int).values - 1 # TODO check that resulting global coordinate is monotonic # Append positions along extra dimension to structure which # encodes the multi-dimensional concatentation order tile_ids = [tile_id + (position,) for tile_id, position - in zip(tile_ids, new_positions)] + in zip(tile_ids, order)] - # TODO check that this is still the correct logic for case of merging but - # no concatenation if len(datasets) > 1 and not concat_dims: raise ValueError("Could not find any dimension coordinates to use to " "order the datasets for concatenation") @@ -121,27 +130,6 @@ def _infer_concat_order_from_coords(datasets): return combined_ids, concat_dims -def _all_arrays_equal(iterator): - try: - iterator = iter(iterator) - first = next(iterator) - return all(np.array_equal(first, rest) for rest in iterator) - except StopIteration: - return True - - -def _infer_order_1d(arr, method='dense'): - # TODO Special cases for string coords - natural sorting instead? - # TODO sort datetime coords too - arr = np.array(arr) - - # We want rank but with identical elements given identical position indices - # - they should be concatenated along another dimension, not along this one - ranks = pd.Series(arr).rank(method=method).values - - return ranks.astype('int') - 1 - - def _check_shape_tile_ids(combined_tile_ids): tile_ids = combined_tile_ids.keys() diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index f331ea73c3a..df0b167fb63 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function from itertools import product +from datetime import datetime import numpy as np import numpy.testing as npt @@ -8,10 +9,9 @@ from xarray import DataArray, Dataset, concat, auto_combine, manual_combine from xarray.core.combine import ( - _new_tile_id, _check_shape_tile_ids, - _combine_all_along_first_dim, _combine_nd, _all_arrays_equal, - _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _infer_concat_order_from_coords, _infer_order_1d) + _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, + _combine_nd, _infer_concat_order_from_positions, + _infer_tile_ids_from_nested_list, _infer_concat_order_from_coords,) from xarray.core.pycompat import OrderedDict from . 
import (assert_combined_tile_ids_equal, assert_identical, assert_equal, @@ -109,40 +109,6 @@ def test_infer_from_datasets(self): _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) -class TestInferOrder1D(object): - def test_arrays(self): - npt.assert_equal(_infer_order_1d([3, 1, 2, 7]), np.array([2, 0, 1, 3])) - npt.assert_equal(_infer_order_1d([5, 7, 8, 8]), np.array([0, 1, 2, 2])) - npt.assert_equal(_infer_order_1d([2, 2, 0]), np.array([1, 1, 0])) - npt.assert_equal(_infer_order_1d([2, 5, 5, 1]), np.array([1, 2, 2, 0])) - - @pytest.mark.xfail - def test_strings(self): - npt.assert_equal(_infer_order_1d(['b', 'a']), np.array([1, 0])) - npt.assert_equal(_infer_order_1d(['aa', 'a']), np.array([1, 0])) - npt.assert_equal(_infer_order_1d(['c1', 'c0']), np.array([1, 0])) - - npt.assert_equal(_infer_order_1d(['c1', 'c0', 'c0']), - np.array([1, 0, 0])) - - # Natural sorting - npt.assert_equal(_infer_order_1d(['c1', 'c0', 'c10']), - np.array([1, 0, 2])) - - @pytest.mark.skip - def test_datetimes(self): - pass - - -def test_all_arrays_equal(): - assert _all_arrays_equal([np.array([1, 2, 3]), - np.array([1, 2, 3]), - np.array([1, 2, 3])]) - assert not _all_arrays_equal([np.array([1, 2, 3]), - np.array([1, 2, 3]), - np.array([1, 2, 4])]) - - class TestTileIDsFromCoords(object): def test_1d(self): ds0 = Dataset({'x': [0, 1]}) @@ -212,13 +178,34 @@ def test_2d_plus_bystander_dim(self): assert_combined_tile_ids_equal(expected, actual) assert concat_dims == ['x', 'y'] - @pytest.mark.skip - def test_string_coord(self): - pass + def test_string_coords(self): + ds0 = Dataset({'person': ['Alice', 'Bob']}) + ds1 = Dataset({'person': ['Caroline', 'Daniel']}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['person'] + + # TODO decide if natural sorting of string coords is desired + @pytest.mark.xfail + def test_natural_sort_string_coords(self): + ds0 = Dataset({'simulation': ['run8', 'run9']}) + ds1 = Dataset({'simulation': ['run10', 'run11']}) - @pytest.mark.skip - def test_datetime_coord(self): - pass + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['simulation'] + + def test_datetime_coords(self): + ds0 = Dataset({'time': [datetime(2000, 3, 6), datetime(2001, 3, 7)]}) + ds1 = Dataset({'time': [datetime(1999, 1, 1), datetime(1999, 2, 4)]}) + + expected = {(0,): ds1, (1,): ds0} + actual, concat_dims = _infer_concat_order_from_coords([ds0, ds1]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['time'] @pytest.fixture(scope='module') @@ -547,7 +534,7 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] - with pytest.raises(KeyError): + with raises_regex(ValueError, 'Every dimension needs a coordinate'): auto_combine(objs) def test_infer_order_from_coords(self): From 67f11f3f4b7c30e521e8363cdbfc0c76a4df3677 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 15 Jan 2019 14:38:33 +0000 Subject: [PATCH 55/96] Fixed performance bug from GH #2662 --- xarray/core/combine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 9021d7dc9ef..2dbc59d8e7d 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -417,14 +417,16 @@ def auto_combine(datasets, 
compat='no_conflicts', data_vars='all', """ # Group by data vars - grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds))) + sorted_datasets = sorted(datasets, key=lambda ds: tuple(sorted(ds))) + grouped_by_vars = itertools.groupby(sorted_datasets, + key=lambda ds: tuple(sorted(ds))) # Perform the multidimensional combine on each group of data variables # before merging back together concatenated_grouped_by_data_vars = [] - for vars, datasets in grouped: + for vars, datasets_with_same_vars in grouped_by_vars: combined_ids, concat_dims = _infer_concat_order_from_coords( - list(datasets)) + list(datasets_with_same_vars)) # TODO checking the shape of the combined ids appropriate here? _check_shape_tile_ids(combined_ids) From 3b843f548d38fb67e6cea92013106bb11f476657 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 23 Jan 2019 19:17:59 +0000 Subject: [PATCH 56/96] Removed ToDos about natural sorting of string coords --- xarray/tests/test_combine.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index df0b167fb63..c0326a31b22 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -187,13 +187,12 @@ def test_string_coords(self): assert_combined_tile_ids_equal(expected, actual) assert concat_dims == ['person'] - # TODO decide if natural sorting of string coords is desired - @pytest.mark.xfail - def test_natural_sort_string_coords(self): + # Decided against natural sorting of string coords GH #2616 + def test_lexicographic_sort_string_coords(self): ds0 = Dataset({'simulation': ['run8', 'run9']}) ds1 = Dataset({'simulation': ['run10', 'run11']}) - expected = {(0,): ds0, (1,): ds1} + expected = {(0,): ds1, (1,): ds0} actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) assert_combined_tile_ids_equal(expected, actual) assert concat_dims == ['simulation'] @@ -522,7 +521,6 @@ def test_auto_combine(self): 'a': [0, 1]}) assert_identical(expected, actual) - # TODO check this is the desired behaviour objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] actual = auto_combine(objs) expected = Dataset({'x': [0, 1], 'y': [0, 1]}) From bb98d548d1ff860b533d28385424e50d7970f57b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 24 Jan 2019 10:44:01 +0000 Subject: [PATCH 57/96] Generalized auto_combine to handle monotonically-decreasing coords too --- xarray/core/combine.py | 23 ++++++++++++----------- xarray/tests/test_combine.py | 15 ++++++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 54925506dad..775d9258ee2 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -89,12 +89,15 @@ def _infer_concat_order_from_coords(datasets): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) - # TODO generalise this to deduce whether coord should be - # monotonically increasing or decreasing - if not all(index.is_monotonic_increasing for index in indexes): - raise ValueError("Coordinate variable {} is not " - "monotonically increasing on all " - "datasets".format(dim)) + if all(index.is_monotonic_increasing for index in indexes): + ascending = True + elif all(index.is_monotonic_decreasing for index in indexes): + ascending = False + else: + raise ValueError("Coordinate variable {} is neither " + "monotonically increasing nor " + "monotonically decreasing on all datasets" + .format(dim)) # Assume that any two datasets whose coord 
along dim starts # with the same value have the same coord values throughout. @@ -104,15 +107,13 @@ def _infer_concat_order_from_coords(datasets): except IndexError: raise ValueError('Cannot handle size zero dimensions') - # TODO This seems to work for strings and datetime objects too - # but is that guaranteed pandas behaviour? - # Sort datasets along dim # We want rank but with identical elements given identical # position indices - they should be concatenated along another # dimension, not along this one - order = first_items.to_series().rank(method='dense').astype( - int).values - 1 + series = first_items.to_series() + rank = series.rank(method='dense', ascending=ascending) + order = rank.astype(int).values - 1 # TODO check that resulting global coordinate is monotonic diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 9a108fb8c17..c46745d0330 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -4,7 +4,6 @@ from datetime import datetime import numpy as np -import numpy.testing as npt import pytest from xarray import DataArray, Dataset, concat, auto_combine, manual_combine @@ -144,10 +143,19 @@ def test_no_dimension_coords(self): def test_coord_not_monotonic(self): ds0 = Dataset({'x': [0, 1]}) ds1 = Dataset({'x': [3, 2]}) - with raises_regex(ValueError, "Coordinate variable x is not " - "monotonically increasing"): + with raises_regex(ValueError, "Coordinate variable x is neither " + "monotonically increasing nor"): _infer_concat_order_from_coords([ds1, ds0]) + def test_coord_monotonically_decreasing(self): + ds0 = Dataset({'x': [3, 2]}) + ds1 = Dataset({'x': [1, 0]}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x'] + # TODO implement this error message @pytest.mark.xfail def test_check_for_impossible_ordering(self): @@ -572,6 +580,7 @@ def test_auto_combine_no_concat(self): expected = Dataset({'x': 0, 'y': 1, 'z': 2}) assert_identical(expected, actual) + # TODO decide if this test should be rewritten @pytest.mark.xfail def test_internal_ordering(self): # This gives a MergeError if _auto_combine_1d is not sorting by From e3f7523ad3457bd95cdeaf6c6f0a0f58d64e404b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 11:24:16 +0000 Subject: [PATCH 58/96] Added more examples to docstring for manual_combine --- xarray/core/combine.py | 54 +++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 775d9258ee2..25e703cf4c3 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -218,11 +218,6 @@ def _combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, # Should it just use concat directly instead? if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - - - print(dim) - print(datasets) - combined = _auto_concat(datasets, dim=dim, data_vars=data_vars, coords=coords) else: @@ -290,6 +285,8 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation and merge instead along a particular dimension. + The position of ``None`` in the list specifies the dimension of the + nested-list input along which to merge. Must be the same length as the depth of the list passed to ``datasets``. 
compat : {'identical', 'equals', 'broadcast_equals', @@ -317,35 +314,58 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, Examples -------- - Collecting output from a parallel simulation: - - Collecting data from a simulation which decomposes its domain into 4 parts, - 2 each along both the x and y axes, requires organising the datasets into a - nested list, e.g. + A common task is collecting data from a parallelized simulation in which + each processor wrote out to a separate file. A domain which was decomposed + into 4 parts, 2 each along both the x and y axes, requires organising the + datasets into a doubly-nested list, e.g: >>> x1y1 Dimensions: (x: 2, y: 2) - Coordinates: - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 42.25 42.21 42.63 42.59 Dimensions without coordinates: x, y Data variables: temperature (x, y) float64 11.04 23.57 20.77 ... precipitation (x, y) float64 5.904 2.453 3.404 ... >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] - >>> combined = xr.auto_combine(ds_grid, concat_dims=['x', 'y']) + >>> combined = xr.manual_combine(ds_grid, concat_dim=['x', 'y']) Dimensions: (x: 4, y: 4) - Coordinates: - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 42.25 42.21 42.63 42.59 Dimensions without coordinates: x, y Data variables: temperature (x, y) float64 11.04 23.57 20.77 ... precipitation (x, y) float64 5.904 2.453 3.404 ... + + ``manual_combine`` can also be used to explicitly merge datasets with + different variables. For example if we have 4 datasets, which are divided + along two times, and contain two different variables, we can pass ``None`` + to ``concat_dim`` to specify the dimension of the nested list over which + we wish to use ``merge`` instead of ``concat``: + + >>> t1temp + + Dimensions: (t: 5) + Dimensions without coordinates: t + Data variables: + temperature (t) float64 11.04 23.57 20.77 ... + + >>> t1precip + + Dimensions: (t: 5) + Dimensions without coordinates: t + Data variables: + precipitation (t) float64 5.904 2.453 3.404 ... + + >>> ds_grid = [[t1temp, t1precip], [t2temp, t2precip]] + >>> combined = xr.manual_combine(ds_grid, concat_dim=['t', None]) + + Dimensions: (t: 10) + Dimensions without coordinates: t + Data variables: + temperature (t) float64 11.04 23.57 20.77 ... + precipitation (t) float64 5.904 2.453 3.404 ... + See also -------- concat From d96595e455337258b7e6af8e4429ada89f211abc Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 11:49:10 +0000 Subject: [PATCH 59/96] Added note about globbing aspect of open_mfdataset --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 33928d105b6..33d69ae1e31 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -504,7 +504,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, list of files to open. Paths can be given as strings or as pathlib Paths. If concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``manual_combine`` for - details). + details). (A string glob will be expanded to a 1-dimensional list.) chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. 
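The two docstring changes above (the nested-list examples for ``manual_combine`` and the globbing note for ``open_mfdataset``) together describe the intended user-facing workflow. A minimal sketch of that workflow follows; it assumes the in-progress API from this patch series (``combine='manual'`` plus a list-valued ``concat_dim`` in ``open_mfdataset``), and the file names and the 2x2 decomposition are purely hypothetical, chosen only to mirror the docstring example.

import xarray as xr

# Hypothetical output files from a simulation decomposed along 't' and 'x';
# the nesting of the list mirrors the ['t', 'x'] concatenation order.
paths = [['sim_t1_x1.nc', 'sim_t1_x2.nc'],
         ['sim_t2_x1.nc', 'sim_t2_x2.nc']]

# combine='manual' concatenates along 't' for the outer level of the list,
# then along 'x' for the inner level.
combined = xr.open_mfdataset(paths, combine='manual', concat_dim=['t', 'x'])

# A string glob is expanded to a flat, 1-dimensional list of paths, so only
# a single concatenation dimension applies in that case.
flat = xr.open_mfdataset('sim_t*_x1.nc', combine='manual', concat_dim='t')
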
From 79f09c01f7152df29cf15b89ce91df1ce16be9e8 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 16:28:59 +0000 Subject: [PATCH 60/96] Removed auto-inferring of concatenation dimension in manual_combine --- xarray/backends/api.py | 20 ++++++------ xarray/core/combine.py | 63 ++++++++++++++---------------------- xarray/core/concat.py | 26 --------------- xarray/tests/test_combine.py | 25 ++++++-------- 4 files changed, 44 insertions(+), 90 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 33d69ae1e31..3b7060be6de 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,9 +10,8 @@ from .. import Dataset, DataArray, backends, conventions from ..core import indexing from .. import auto_combine -from ..core.combine import (_manual_combine, _CONCAT_DIM_DEFAULT, - _infer_concat_order_from_positions) -from ..core.utils import (close_on_error, is_grib_path, is_remote_uri) +from ..core.combine import _manual_combine, _infer_concat_order_from_positions +from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler @@ -483,7 +482,7 @@ def close(self): f.close() -def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dim='__auto_combine__', compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', combine='auto', autoclose=None, parallel=False, **kwargs): @@ -517,8 +516,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, need to provide this argument if any of the dimensions along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. - By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=[..., None, ...]`` explicitly to + Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. 
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional @@ -613,11 +611,13 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # If combine='manual' then this creates a flat list which is easier to # iterate over, while saving the originally-supplied structure as "ids" if combine is 'manual': - if concat_dim is not _CONCAT_DIM_DEFAULT: + if concat_dim is '__auto_combine__': + raise ValueError("Must supply concat_dim when using manual " + "combine") + else: if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] - combined_ids_paths, concat_dims = _infer_concat_order_from_positions( - paths, concat_dim) + combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = ( list(combined_ids_paths.keys()), list(combined_ids_paths.values())) @@ -650,7 +650,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, if combine is 'auto': # Will redo ordering from coordinates, ignoring how they were # ordered previously - if concat_dim is not _CONCAT_DIM_DEFAULT: + if concat_dim is not '__auto_combine__': raise ValueError("Cannot specify dimensions to concatenate " "along when auto-combining") diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 5a559905c93..45466330ec3 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -9,28 +9,12 @@ from .merge import merge from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars -from .concat import _auto_concat +from .concat import concat -_CONCAT_DIM_DEFAULT = utils.ReprObject('') - - -def _infer_concat_order_from_positions(datasets, concat_dims): - +def _infer_concat_order_from_positions(datasets): combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ())) - - tile_id, ds = list(combined_ids.items())[0] - n_dims = len(tile_id) - - if concat_dims is _CONCAT_DIM_DEFAULT: - concat_dims = [_CONCAT_DIM_DEFAULT] * n_dims - else: - if len(concat_dims) != n_dims: - raise ValueError("concat_dims has length {} but the datasets " - "passed are nested in a {}-dimensional structure" - .format(str(len(concat_dims)), str(n_dims))) - - return combined_ids, concat_dims + return combined_ids def _infer_tile_ids_from_nested_list(entry, current_pos): @@ -46,7 +30,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): Parameters ---------- - entry : list[list[obj, obj, ...]] + entry : list[list[obj, obj, ...], ...] List of lists of arbitrary depth, containing objects in the order they are to be concatenated. @@ -176,6 +160,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', combined_ds : xarray.Dataset """ + tile_id, ds = list(combined_ids.items())[0] + + n_dims = len(tile_id) + if len(concat_dims) != n_dims: + raise ValueError("concat_dims has length {} but the datasets " + "passed are nested in a {}-dimensional structure" + .format(str(len(concat_dims)), str(n_dims))) + # Each iteration of this loop reduces the length of the tile_ids tuples # by one. 
It always combines along the first dimension, removing the first # element of the tuple @@ -207,19 +199,16 @@ def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat): return new_combined_ids -def _combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', data_vars='all', coords='different'): +def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all', + coords='different'): """ Applies either concat or merge to 1D list of datasets depending on value of concat_dim """ - # TODO this logic is taken from old 1D auto_combine - check if it's right - # Should it just use concat directly instead? if concat_dim is not None: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - combined = _auto_concat(datasets, dim=dim, data_vars=data_vars, - coords=coords) + combined = concat(datasets, dim=concat_dim, data_vars=data_vars, + coords=coords) else: combined = merge(datasets, compat=compat) @@ -238,8 +227,7 @@ def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids): if not ids: # Determine tile_IDs by structure of input in N-D # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions( - datasets, concat_dims) + combined_ids = _infer_concat_order_from_positions(datasets) else: # Already sorted so just use the ids already passed combined_ids = OrderedDict(zip(ids, datasets)) @@ -253,8 +241,8 @@ def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids): return combined -def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', data_vars='all', coords='different'): +def manual_combine(datasets, concat_dim, compat='no_conflicts', + data_vars='all', coords='different'): """ Explicitly combine an N-dimensional grid of datasets into one by using a succession of concat and merge operations along each dimension of the grid. @@ -279,12 +267,11 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, Dataset objects to combine. If concatenation or merging along more than one dimension is desired, then datasets must be supplied in a nested list-of-lists. - concat_dim : str, or list of str, DataArray, Index or None, optional + concat_dim : str, or list of str, DataArray, Index or None Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. - By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=[..., None, ...]`` explicitly to - disable concatenation and merge instead along a particular dimension. + Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation + and merge instead along a particular dimension. The position of ``None`` in the list specifies the dimension of the nested-list input along which to merge. 
Must be the same length as the depth of the list passed to @@ -372,10 +359,8 @@ def manual_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, merge auto_combine """ - - if concat_dim is not _CONCAT_DIM_DEFAULT: - if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: - concat_dim = [concat_dim] + if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: + concat_dim = [concat_dim] # The IDs argument tells _manual_combine that datasets aren't yet sorted return _manual_combine(datasets, concat_dims=concat_dim, compat=compat, diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 48f09b59bca..d38bf53136b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -338,29 +338,3 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat, ds = _dataset_concat(datasets, dim, data_vars, coords, compat, positions) return arrays[0]._from_temp_dataset(ds, name) - - -def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): - if len(datasets) == 1 and dim is None: - # There is nothing more to combine, so kick out early. - return datasets[0] - else: - if dim is None: - ds0 = datasets[0] - ds1 = datasets[1] - concat_dims = set(ds0.dims) - if ds0.dims != ds1.dims: - dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) - concat_dims = set(i for i, _ in dim_tuples) - if len(concat_dims) > 1: - concat_dims = set(d for d in concat_dims - if not ds0[d].equals(ds1[d])) - if len(concat_dims) > 1: - raise ValueError('too many different dimensions to ' - 'concatenate: %s' % concat_dims) - elif len(concat_dims) == 0: - raise ValueError('cannot infer dimension to concatenate: ' - 'supply the ``concat_dim`` argument ' - 'explicitly') - dim, = concat_dims - return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 01867572a49..fd784ba7ce3 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -11,7 +11,7 @@ from xarray.core.combine import ( _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, _infer_concat_order_from_positions, - _infer_tile_ids_from_nested_list, _infer_concat_order_from_coords,) + _infer_concat_order_from_coords) from . 
import (assert_combined_tile_ids_equal, assert_identical, assert_equal, raises_regex) @@ -24,7 +24,7 @@ def test_1d(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): @@ -34,7 +34,7 @@ def test_2d(self): expected = {(0, 0): ds(0), (0, 1): ds(1), (1, 0): ds(2), (1, 1): ds(3), (2, 0): ds(4), (2, 1): ds(5)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): @@ -48,7 +48,7 @@ def test_3d(self): (1, 0, 0): ds(6), (1, 0, 1): ds(7), (1, 1, 0): ds(8), (1, 1, 1): ds(9), (1, 2, 0): ds(10), (1, 2, 1): ds(11)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_single_dataset(self): @@ -56,7 +56,7 @@ def test_single_dataset(self): input = [ds] expected = {(0,): ds} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_redundant_nesting(self): @@ -64,14 +64,14 @@ def test_redundant_nesting(self): input = [[ds(0)], [ds(1)]] expected = {(0, 0): ds(0), (1, 0): ds(1)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_ignore_empty_list(self): ds = create_test_data(0) input = [ds, []] expected = {(0,): ds} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_uneven_depth_input(self): @@ -81,7 +81,7 @@ def test_uneven_depth_input(self): input = [ds(0), [ds(1), ds(2)]] expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_uneven_length_input(self): @@ -91,7 +91,7 @@ def test_uneven_length_input(self): input = [[ds(0)], [ds(1), ds(2)]] expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -99,14 +99,9 @@ def test_infer_from_datasets(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual, concat_dims = _infer_concat_order_from_positions(input, [ - 'dim1']) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) - input = [ds(0), ds(1)] - with pytest.raises(ValueError): - _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) - class TestTileIDsFromCoords(object): def test_1d(self): From e32adb367cdcd1d8128576d6c66dc5653d49a006 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 17:17:26 +0000 Subject: [PATCH 61/96] Added example to docstring for auto_combine --- xarray/core/combine.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 45466330ec3..3ef1094a5b6 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -323,7 +323,6 @@ def 
manual_combine(datasets, concat_dim, compat='no_conflicts', temperature (x, y) float64 11.04 23.57 20.77 ... precipitation (x, y) float64 5.904 2.453 3.404 ... - ``manual_combine`` can also be used to explicitly merge datasets with different variables. For example if we have 4 datasets, which are divided along two times, and contain two different variables, we can pass ``None`` @@ -382,9 +381,8 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', a combination of concat and merge. Will attempt to order the datasets such that the values in their dimension - coordinates are monotonically increasing along all dimensions. If it cannot - determine the order in which to concatenate the datasets, it will raise an - error. + coordinates are monotonic along all dimensions. If it cannot determine the + order in which to concatenate the datasets, it will raise a ValueError. Non-coordinate dimensions will be ignored, as will any coordinate dimensions which do not vary between each dataset. @@ -396,7 +394,7 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', and each combination of a distinct time period and set of data variables is saved as its own dataset. Also useful for if you have a simulation which is parallelized in multiple dimensions, but has global coordinates saved in - each file specifying it's position within the domain. + each file specifying the positions of points within the global domain. Parameters ---------- @@ -429,6 +427,37 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', concat merge manual_combine + + Examples + -------- + + Combining two datasets using their common dimension coordinates. Notice + they are concatenated based on the values in their dimension coordinates, + not on their position in the list passed to `auto_combine`. + + >>> x1 + + Dimensions: (x: 3) + Coords: + position (x) int64 0 1 2 + Data variables: + temperature (x) float64 11.04 23.57 20.77 ... + + >>> x2 + + Dimensions: (x: 3) + Coords: + position (x) int64 3 4 5 + Data variables: + temperature (x) float64 6.97 8.13 7.42 ... + + >>> combined = xr.auto_combine([x2, x1]) + + Dimensions: (x: 6) + Coords: + position (x) int64 0 1 2 3 4 5 + Data variables: + temperature (x) float64 11.04 23.57 20.77 ... """ # Group by data vars From da4d60560236b57c1d756e3c2113185ac30a8e12 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 17:27:24 +0000 Subject: [PATCH 62/96] Minor correction to docstring --- xarray/core/combine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3ef1094a5b6..e37db3ed5f3 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -439,25 +439,25 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', Dimensions: (x: 3) Coords: - position (x) int64 0 1 2 + * position (x) int64 0 1 2 Data variables: - temperature (x) float64 11.04 23.57 20.77 ... + temperature (x) float64 11.04 23.57 20.77 ... >>> x2 Dimensions: (x: 3) Coords: - position (x) int64 3 4 5 + * position (x) int64 3 4 5 Data variables: - temperature (x) float64 6.97 8.13 7.42 ... + temperature (x) float64 6.97 8.13 7.42 ... >>> combined = xr.auto_combine([x2, x1]) Dimensions: (x: 6) Coords: - position (x) int64 0 1 2 3 4 5 + * position (x) int64 0 1 2 3 4 5 Data variables: - temperature (x) float64 11.04 23.57 20.77 ... + temperature (x) float64 11.04 23.57 20.77 ... 
""" # Group by data vars From c4fe22c769dc74d44fcdc80df125347ecb903550 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 28 Jan 2019 17:29:26 +0000 Subject: [PATCH 63/96] Another very minor docstring correction --- xarray/core/combine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index e37db3ed5f3..78b6dbbfe86 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -439,25 +439,25 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', Dimensions: (x: 3) Coords: - * position (x) int64 0 1 2 + * position (x) int64 0 1 2 Data variables: - temperature (x) float64 11.04 23.57 20.77 ... + temperature (x) float64 11.04 23.57 20.77 ... >>> x2 Dimensions: (x: 3) Coords: - * position (x) int64 3 4 5 + * position (x) int64 3 4 5 Data variables: - temperature (x) float64 6.97 8.13 7.42 ... + temperature (x) float64 6.97 8.13 7.42 ... >>> combined = xr.auto_combine([x2, x1]) Dimensions: (x: 6) Coords: - * position (x) int64 0 1 2 3 4 5 + * position (x) int64 0 1 2 3 4 5 Data variables: - temperature (x) float64 11.04 23.57 20.77 ... + temperature (x) float64 11.04 23.57 20.77 ... """ # Group by data vars From 66b4c4f9b11f722dc82695a052c3342b5b26f9ec Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 27 Feb 2019 17:13:49 +0000 Subject: [PATCH 64/96] Added test to guard against issue #2777 --- xarray/tests/test_combine.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index fd784ba7ce3..201d90db8de 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -411,6 +411,21 @@ def test_concat_multiple_dims(self): expected = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}) assert_identical(expected, actual) + def test_concat_name_symmetry(self): + """Inspired by the discussion on GH issue #2777""" + + da1 = DataArray(name='a', data=[[0]], dims=['x', 'y']) + da2 = DataArray(name='b', data=[[1]], dims=['x', 'y']) + da3 = DataArray(name='a', data=[[2]], dims=['x', 'y']) + da4 = DataArray(name='b', data=[[3]], dims=['x', 'y']) + + x_first = manual_combine([[da1, da2], [da3, da4]], + concat_dim=['x', 'y']) + y_first = manual_combine([[da1, da3], [da2, da4]], + concat_dim=['y', 'x']) + + assert_identical(x_first, y_first) + def test_concat_one_dim_merge_another(self): data = create_test_data() data1 = data.copy(deep=True) From 90f0c1da14336d8d1b6a78f203e75ceb0a048979 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 2 Mar 2019 15:54:32 +0000 Subject: [PATCH 65/96] Started deprecation cycle for auto_combine --- xarray/backends/api.py | 18 ++- xarray/core/combine.py | 209 +++++++++++++++++++++++++++++++++- xarray/core/concat.py | 1 - xarray/tests/test_backends.py | 93 ++++++++++----- xarray/tests/test_combine.py | 188 ++++++++++++++++++++++++++---- 5 files changed, 442 insertions(+), 67 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3b7060be6de..5a148286f9d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -482,7 +482,7 @@ def close(self): f.close() -def open_mfdataset(paths, chunks=None, concat_dim='__auto_combine__', +def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', combine='auto', autoclose=None, parallel=False, **kwargs): @@ -648,15 +648,13 @@ def open_mfdataset(paths, chunks=None, concat_dim='__auto_combine__', # 
Combine all datasets, closing them in case of a ValueError try: if combine is 'auto': - # Will redo ordering from coordinates, ignoring how they were - # ordered previously - if concat_dim is not '__auto_combine__': - raise ValueError("Cannot specify dimensions to concatenate " - "along when auto-combining") - - combined = auto_combine(datasets, compat=compat, - data_vars=data_vars, coords=coords) - + # Use the old auto_combine for now + # After deprecation cycle from #2616 is complete this will redo + # ordering from coordinates, ignoring how they were ordered + # previously + combined = auto_combine(datasets, concat_dim=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords) else: # Combined nested list by successive concat and merge operations # along each dimension, using structure given by "ids" diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 78b6dbbfe86..3d97974d3be 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -5,10 +5,7 @@ import pandas as pd from .dataarray import DataArray -from . import utils from .merge import merge -from .variable import IndexVariable, Variable, as_variable -from .variable import concat as concat_vars from .concat import concat @@ -370,8 +367,8 @@ def vars_as_keys(ds): return tuple(sorted(ds)) -def auto_combine(datasets, compat='no_conflicts', data_vars='all', - coords='different'): +def _auto_combine(datasets, compat='no_conflicts', data_vars='all', + coords='different'): """ Attempt to auto-magically combine the given datasets into one by using dimension coordinates. @@ -460,6 +457,9 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', temperature (x) float64 11.04 23.57 20.77 ... """ + # TODO to complete deprecation cycle in #2616 this should become the new + # auto_combine function (with this docstring) + # Group by data vars sorted_datasets = sorted(datasets, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) @@ -482,3 +482,202 @@ def auto_combine(datasets, compat='no_conflicts', data_vars='all', concatenated_grouped_by_data_vars.append(concatenated) return merge(concatenated_grouped_by_data_vars, compat=compat) + + +# Everything beyond here is only needed for backwards compatibility, see #2616 + + +_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' + + +def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', + data_vars='all', coords='different'): + """ + Attempt to auto-magically combine the given datasets into one. + + This method attempts to combine a list of datasets into a single entity by + inspecting metadata and using a combination of concat and merge. + It does not concatenate along more than one dimension or sort data under + any circumstances. It does align coordinates, but different variables on + datasets can cause it to fail under some scenarios. In complex cases, you + may need to clean up your data and use ``concat``/``merge`` explicitly. + ``auto_combine`` works well if you have N years of data and M data + variables, and each combination of a distinct time period and set of data + variables is saved its own dataset. + + Parameters + ---------- + datasets : sequence of xarray.Dataset + Dataset objects to merge. + concat_dim : str or DataArray or Index, optional + Dimension along which to concatenate variables, as used by + :py:func:`xarray.concat`. 
You only need to provide this argument if + the dimension along which you want to concatenate is not a dimension + in the original datasets, e.g., if you want to stack a collection of + 2D arrays along a third dimension. + By default, xarray attempts to infer this argument by examining + component files. Set ``concat_dim=None`` explicitly to disable + concatenation. + compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional + String indicating how to compare variables of the same name for + potential conflicts: + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + coords : {'minimal', 'different', 'all' o list of str}, optional + Details are in the documentation of concat + + Returns + ------- + combined : xarray.Dataset + + See also + -------- + concat + Dataset.merge + """ + + if concat_dim is '_not_supplied': + concat_dim = _CONCAT_DIM_DEFAULT + else: + message = """In xarray version 0.13 `auto_combine` and `open_mfdataset` + will no longer accept a `concat_dim` argument. To get + equivalent behaviour from now on please use the new + `manual_combine` function instead (or the + `combine='manual'` option to open_mfdataset).""" + warnings.warn(message, FutureWarning) + + if _dimension_coords_exist(datasets): + message = """The datasets supplied have global dimension coordinates. + From xarray version 0.13 the behaviour of `auto_combine` + and `open_mfdataset` will + change to use the values in these coordinates to order the + datasets before concatenation. in future, to continue + concatenating based on the order the datasets are supplied + in, please use the new `manual_combine` function (or the + `combine='manual'` option to open_mfdataset).""" + warnings.warn(message, FutureWarning) + else: + message = """The datasets supplied do not have global dimension + coordinates. From xarray version 0.13 the behaviour of + `auto_combine` and `open_mfdataset` will change to use the + values in these + coordinates to order the datasets before concatenation. + Datasets without global dimension coordinates will cease to + be valid arguments to `auto_combine`. In future, to continue + concatenating without supplying dimension coordinates, please + use the new `manual_combine` function (or the + `combine='manual'` option to open_mfdataset).""" + warnings.warn(message, FutureWarning) + + if _requires_concat_and_merge(datasets): + manual_dims = [concat_dim].append(None) + message = """The datasets supplied require both concatenation and + merging. From xarray version 0.13 this will operation will + require using the new `manual_combine` function (or the + `combine='manual'` option to open_mfdataset). 
You will + need to create a nested list structure such that you can + combine along the dimensions {}.""".format(manual_dims) + warnings.warn(message, FutureWarning) + + return _old_auto_combine(datasets, concat_dim=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords) + + +def _dimension_coords_exist(datasets): + """ + Check if the datasets have consistent global dimension coordinates + which would in future be used by `auto_combine` for concatenation ordering. + """ + + # Group by data vars + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) + + # Perform the multidimensional combine on each group of data variables + # before merging back together + concatenated_grouped_by_data_vars = [] + try: + for vars, datasets_with_same_vars in grouped_by_vars: + _infer_concat_order_from_coords(list(datasets_with_same_vars)) + return True + except ValueError as err: + no_dimension_coords_errs = ["Every dimension needs a coordinate", + "neither monotonically increasing nor", + "Cannot handle size zero", + "Could not find any dimension coordinates"] + if any(message in str(err) for message in no_dimension_coords_errs): + # The ValueError just means that the datasets don't have + # global dimension coordinates + return False + else: + # There is a different problem + raise err + + +def _requires_concat_and_merge(datasets): + """ + Check if the datasets require the use of both xarray.concat and + xarray.merge, which in future might require the user to use + `manual_combine` instead. + """ + # Group by data vars + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) + + if len(list(grouped_by_vars)) > 1: + return True + else: + return False + + +def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', + data_vars='all', coords='different'): + from toolz import itertoolz + if concat_dim is not None: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), + datasets).values() + concatenated = [_auto_concat(ds, dim=dim, + data_vars=data_vars, coords=coords) + for ds in grouped] + else: + concatenated = datasets + merged = merge(concatenated, compat=compat) + return merged + + +def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): + if len(datasets) == 1 and dim is None: + # There is nothing more to combine, so kick out early. + return datasets[0] + else: + if dim is None: + ds0 = datasets[0] + ds1 = datasets[1] + concat_dims = set(ds0.dims) + if ds0.dims != ds1.dims: + dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) + concat_dims = set(i for i, _ in dim_tuples) + if len(concat_dims) > 1: + concat_dims = set(d for d in concat_dims + if not ds0[d].equals(ds1[d])) + if len(concat_dims) > 1: + raise ValueError('too many different dimensions to ' + 'concatenate: %s' % concat_dims) + elif len(concat_dims) == 0: + raise ValueError('cannot infer dimension to concatenate: ' + 'supply the ``concat_dim`` argument ' + 'explicitly') + dim, = concat_dims + return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d38bf53136b..6a4ed682590 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -5,7 +5,6 @@ import pandas as pd - from . 
import utils from .alignment import align from .variable import IndexVariable, Variable, as_variable diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ddb8ab56deb..4a05cd68626 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1979,7 +1979,7 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, subds.to_netcdf(tmpfiles[ii], engine=writeengine) # check that calculation on opened datasets works properly - with open_mfdataset(tmpfiles, combine='manual', concat_dim='x', + with open_mfdataset(tmpfiles, concat_dim='x', engine=readengine, parallel=parallel, chunks=chunks) as actual: @@ -2156,13 +2156,12 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x', chunks={'x': 3}) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + chunks={'x': 3}) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) with raises_regex(IOError, 'no files to open'): @@ -2211,8 +2210,7 @@ def test_open_mfdataset_pathlib(self): tmp2 = Path(tmp2) original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert_identical(original, actual) @requires_pathlib @@ -2247,8 +2245,7 @@ def test_open_mfdataset_2(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): @@ -2261,8 +2258,7 @@ def test_attrs_mfdataset(self): ds2.attrs['test2'] = 'bar' ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: # presumes that attributes inherited from # first dataset loaded assert actual.test1 == ds1.test1 @@ -2282,11 +2278,6 @@ def test_open_mfdataset_auto_combine(self): with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) - with raises_regex(ValueError, "Cannot specify dimensions to " - "concatenate along when " - "auto-combining"): - open_mfdataset([tmp1, tmp2], concat_dim='x') - def test_preprocess_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: @@ -2306,8 +2297,7 @@ def test_save_mfdataset_roundtrip(self): with create_tmp_file() as tmp1: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert_identical(actual, original) def test_save_mfdataset_invalid(self): @@ -2333,8 +2323,7 @@ def test_save_mfdataset_pathlib_roundtrip(self): tmp1 = Path(tmp1) tmp2 = Path(tmp2) save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + 
with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert_identical(actual, original) def test_open_and_do_math(self): @@ -2351,8 +2340,7 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2379,8 +2367,7 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], combine='manual', - concat_dim=[dim]) as actual: + with open_mfdataset([tmp], concat_dim=dim) as actual: assert_identical(expected, actual) def test_open_multi_dataset(self): @@ -2403,8 +2390,7 @@ def test_open_multi_dataset(self): create_tmp_file() as tmp2: original.to_netcdf(tmp1) original.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim=dim) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=dim) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): @@ -2456,11 +2442,62 @@ def test_save_mfdataset_compute_false_roundtrip(self): engine=self.engine, compute=False) assert isinstance(delayed_obj, Delayed) delayed_obj.compute() - with open_mfdataset([tmp1, tmp2], combine='manual', - concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: assert_identical(actual, original) +class TestOpenMFDataSetDeprecation: + """ + Set of tests to check that FutureWarnings are correctly raised until the + deprecation cycle is complete. #2616 + """ + def test_open_mfdataset_with_concat_dim(self): + ds1, ds2 = Dataset({'x': [0]}), Dataset({'x': [1]}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, match="`concat_dim`"): + open_mfdataset([tmp1, tmp2], concat_dim='x') + + def test_auto_combine_with_merge_and_concat(self): + ds1, ds2 = Dataset({'x': [0]}), Dataset({'x': [1]}) + ds3 = Dataset({'z': ((), 99)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + ds3.to_netcdf(tmp3) + + with pytest.warns(FutureWarning, + match="require both concatenation"): + open_mfdataset([tmp1, tmp2, tmp3]) + + def test_auto_combine_with_coords(self): + ds1 = Dataset({'foo': ('x', [0])}, coords={'x': ('x', [0])}) + ds2 = Dataset({'foo': ('x', [1])}, coords={'x': ('x', [1])}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, match="supplied have global"): + open_mfdataset([tmp1, tmp2]) + + def test_auto_combine_without_coords(self): + ds1, ds2 = Dataset({'foo': ('x', [0])}), Dataset({'foo': ('x', [1])}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, + match="supplied do not have global"): + open_mfdataset([tmp1, tmp2]) + + @requires_scipy_or_netCDF4 @requires_pydap class TestPydap(object): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 201d90db8de..6c4fb3cb55c 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,6 +1,4 @@ from collections import OrderedDict -from copy import deepcopy - from itertools import product from datetime import datetime @@ 
-11,14 +9,14 @@ from xarray.core.combine import ( _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, _infer_concat_order_from_positions, - _infer_concat_order_from_coords) + _infer_concat_order_from_coords, _auto_combine) from . import (assert_combined_tile_ids_equal, assert_identical, assert_equal, raises_regex) from .test_dataset import create_test_data -class TestTileIDsFromNestedList(object): +class TestTileIDsFromNestedList: def test_1d(self): ds = create_test_data input = [ds(0), ds(1)] @@ -103,7 +101,7 @@ def test_infer_from_datasets(self): assert_combined_tile_ids_equal(expected, actual) -class TestTileIDsFromCoords(object): +class TestTileIDsFromCoords: def test_1d(self): ds0 = Dataset({'x': [0, 1]}) ds1 = Dataset({'x': [2, 3]}) @@ -227,7 +225,7 @@ def _create_tile_ids(shape): return list(tile_ids) -class TestNewTileIDs(object): +class TestNewTileIDs: @pytest.mark.parametrize("old_id, new_id", [((3, 0, 1), (0, 1)), ((0, 0), (0,)), ((1,), ()), @@ -246,7 +244,7 @@ def test_get_new_tile_ids(self, create_combined_ids): assert expected_tile_ids == actual_tile_ids -class TestCombineND(object): +class TestCombineND: @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) @@ -292,7 +290,7 @@ def test_concat_twice(self, create_combined_ids, concat_dim): assert_equal(result, expected) -class TestCheckShapeTileIDs(object): +class TestCheckShapeTileIDs: def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} @@ -309,7 +307,7 @@ def test_check_lengths(self): _check_shape_tile_ids(combined_tile_ids) -class TestManualCombine(object): +class TestManualCombine: def test_manual_concat(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] expected = Dataset({'x': [0, 1]}) @@ -516,47 +514,51 @@ def test_combine_concat_over_redundant_nesting(self): assert_identical(expected, actual) -class TestAutoCombine(object): +class TestAutoCombine: + """ + When completing the deprecation cycle from #2616 change all + _auto_combine to auto_combine. 
+ """ def test_auto_combine(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - actual = auto_combine([actual]) + actual = _auto_combine([actual]) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) # ensure auto_combine handles non-sorted variables objs = [Dataset({'x': ('a', [0]), 'y': ('a', [0]), 'a': [0]}), Dataset({'x': ('a', [1]), 'y': ('a', [1]), 'a': [1]})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 1]}) assert_identical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': [0, 1], 'y': [0, 1]}) assert_equal(actual, expected) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'Could not find any dimension ' 'coordinates'): - auto_combine(objs) + _auto_combine(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with raises_regex(ValueError, 'Every dimension needs a coordinate'): - auto_combine(objs) + _auto_combine(objs) def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = data assert expected.broadcast_equals(actual) @@ -568,7 +570,7 @@ def test_auto_combine_previously_failed(self): Dataset({'a': ('x', [1]), 'x': [1]})] expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, {'x': [0, 1]}) - actual = auto_combine(datasets) + actual = _auto_combine(datasets) assert_identical(expected, actual) def test_auto_combine_still_fails(self): @@ -577,16 +579,16 @@ def test_auto_combine_still_fails(self): datasets = [Dataset({'x': 0}, {'y': 0}), Dataset({'x': 1}, {'y': 1, 'z': 1})] with pytest.raises(ValueError): - auto_combine(datasets, 'y') + _auto_combine(datasets, 'y') def test_auto_combine_no_concat(self): objs = [Dataset({'x': 0}), Dataset({'y': 1})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': 0, 'y': 1}) assert_identical(expected, actual) objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'x': 0, 'y': 1, 'z': 2}) assert_identical(expected, actual) @@ -599,8 +601,148 @@ def test_internal_ordering(self): Dataset({'bar': ('x', [10, 20]), 'x': [10, 20]}), Dataset({'foo': ('x', [2, 3]), 'x': [2, 3]}), Dataset({'bar': ('x', [30, 40]), 'x': [30, 40]})] - actual = auto_combine(objs) + actual = _auto_combine(objs) expected = Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40]), 'x': [0, 1, 10, 20, 30, 40]}) assert_identical(expected, actual) + + +class TestAutoCombineOldAPI: + """ + Set of tests which check that old 1-dimensional auto_combine behaviour is + still satisfied. 
#2616 + """ + def test_auto_combine(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + actual = auto_combine([actual]) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) + + # ensure auto_combine handles non-sorted variables + objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), + Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + actual = auto_combine(objs) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + with raises_regex(ValueError, 'too many .* dimensions'): + auto_combine(objs) + + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + with raises_regex(ValueError, 'cannot infer dimension'): + auto_combine(objs) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with pytest.raises(KeyError): + auto_combine(objs) + + def test_auto_combine_previously_failed(self): + # In the above scenario, one file is missing, containing the data for + # one year's data for one variable. + datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), + Dataset({'b': ('x', [0]), 'x': [0]}), + Dataset({'a': ('x', [1]), 'x': [1]})] + expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, + {'x': [0, 1]}) + actual = auto_combine(datasets) + assert_identical(expected, actual) + + # Your data includes "time" and "station" dimensions, and each year's + # data has a different set of stations. + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + expected = Dataset({'a': (('t', 'x'), + [[np.nan, 2, 3], [1, 2, np.nan]])}, + {'x': [0, 1, 2]}) + actual = auto_combine(datasets, concat_dim='t') + assert_identical(expected, actual) + + def test_auto_combine_still_fails(self): + # concat can't handle new variables (yet): + # https://github.com/pydata/xarray/issues/508 + datasets = [Dataset({'x': 0}, {'y': 0}), + Dataset({'x': 1}, {'y': 1, 'z': 1})] + with pytest.raises(ValueError): + auto_combine(datasets, 'y') + + def test_auto_combine_no_concat(self): + objs = [Dataset({'x': 0}), Dataset({'y': 1})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1}) + assert_identical(expected, actual) + + objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1, 'z': 2}) + assert_identical(expected, actual) + + data = Dataset({'x': 0}) + actual = auto_combine([data, data, data], concat_dim=None) + assert_identical(data, actual) + + # Single object, with a concat_dim explicitly provided + # Test the issue reported in GH #1988 + objs = [Dataset({'x': 0, 'y': 1})] + dim = DataArray([100], name='baz', dims='baz') + actual = auto_combine(objs, concat_dim=dim) + expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, + {'baz': [100]}) + assert_identical(expected, actual) + + # Just making sure that auto_combine is doing what is + # expected for non-scalar values, too. 
+ objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] + dim = DataArray([100], name='baz', dims='baz') + actual = auto_combine(objs, concat_dim=dim) + expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), + 'y': (('baz', 'z'), [[1, 2]])}, + {'baz': [100]}) + assert_identical(expected, actual) + + def test_auto_combine_order_by_appearance_not_coords(self): + objs = [Dataset({'foo': ('x', [0])}, coords={'x': ('x', [1])}), + Dataset({'foo': ('x', [1])}, coords={'x': ('x', [0])})] + actual = auto_combine(objs) + expected = Dataset({'foo': ('x', [0, 1])}, + coords={'x': ('x', [1, 0])}) + assert_identical(expected, actual) + + +class TestAutoCombineDeprecation: + """ + Set of tests to check that FutureWarnings are correctly raised until the + deprecation cycle is complete. #2616 + """ + def test_auto_combine_with_concat_dim(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + with pytest.warns(FutureWarning, match="`concat_dim`"): + auto_combine(objs, concat_dim='x') + + def test_auto_combine_with_merge_and_concat(self): + objs = [Dataset({'x': [0]}), + Dataset({'x': [1]}), + Dataset({'z': ((), 99)})] + with pytest.warns(FutureWarning, match="require both concatenation"): + auto_combine(objs) + + def test_auto_combine_with_coords(self): + objs = [Dataset({'foo': ('x', [0])}, coords={'x': ('x', [0])}), + Dataset({'foo': ('x', [1])}, coords={'x': ('x', [1])})] + with pytest.warns(FutureWarning, match="supplied have global"): + auto_combine(objs) + + def test_auto_combine_without_coords(self): + objs = [Dataset({'foo': ('x', [0])}), + Dataset({'foo': ('x', [1])})] + with pytest.warns(FutureWarning, match="supplied do not have global"): + auto_combine(objs) From 0990dd497ad7a063500e87ffcfad3dc0cfe849e1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 12:50:52 +0000 Subject: [PATCH 66/96] Fully reverted open_mfdataset tests --- xarray/tests/test_backends.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4a05cd68626..51cbc3913fe 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1979,8 +1979,7 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, subds.to_netcdf(tmpfiles[ii], engine=writeengine) # check that calculation on opened datasets works properly - with open_mfdataset(tmpfiles, concat_dim='x', - engine=readengine, parallel=parallel, + with open_mfdataset(tmpfiles, engine=readengine, parallel=parallel, chunks=chunks) as actual: # check that using open_mfdataset returns dask arrays for variables @@ -2156,11 +2155,11 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) - with open_mfdataset([tmp1, tmp2], concat_dim='x', + with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) @@ -2210,7 +2209,7 @@ def test_open_mfdataset_pathlib(self): tmp2 = Path(tmp2) original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) 
@requires_pathlib @@ -2245,7 +2244,7 @@ def test_open_mfdataset_2(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): @@ -2258,7 +2257,7 @@ def test_attrs_mfdataset(self): ds2.attrs['test2'] = 'bar' ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: # presumes that attributes inherited from # first dataset loaded assert actual.test1 == ds1.test1 @@ -2297,7 +2296,7 @@ def test_save_mfdataset_roundtrip(self): with create_tmp_file() as tmp1: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(actual, original) def test_save_mfdataset_invalid(self): @@ -2323,7 +2322,7 @@ def test_save_mfdataset_pathlib_roundtrip(self): tmp1 = Path(tmp1) tmp2 = Path(tmp2) save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(actual, original) def test_open_and_do_math(self): @@ -2442,7 +2441,7 @@ def test_save_mfdataset_compute_false_roundtrip(self): engine=self.engine, compute=False) assert isinstance(delayed_obj, Delayed) delayed_obj.compute() - with open_mfdataset([tmp1, tmp2], concat_dim='x') as actual: + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(actual, original) @@ -2806,7 +2805,6 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - combine='manual', concat_dim=['TSTEP'], backend_kwargs={'format': 'uamiv'}) From d6277be83ea779d44a15df343f2e066241ea0730 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 13:05:59 +0000 Subject: [PATCH 67/96] Updated what's new to match deprecation cycle --- doc/whats-new.rst | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d6e049e08bb..3a750d1748b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,23 +25,6 @@ Breaking changes Python 3 only. (:issue:`1876`). By `Joe Hamman `_. - -- Combining datasets along N dimensions: - - - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any - number of dimensions, instead of just a one-dimensional list of datasets. - - If the datasets have monotonic global dimension coordinates then the new - ``auto_combine`` should be used. If not then the new ``manual_combine`` - will accept the datasets as a a nested list-of-lists, and combine by - applying a series of concat and merge operations. - - Breaking because some lists that were previously valid inputs to - ``open_mfdataset`` and ``auto_combine`` may no longer be valid, and should - now be combined explicitly using ``manual_combine`` instead. - (:issue:`2159`) By `Tom Nicholas `_. - - Enhancements ~~~~~~~~~~~~ @@ -60,6 +43,21 @@ Enhancements report showing what exactly differs between the two objects (dimensions / coordinates / variables / attributes) (:issue:`1507`). By `Benoit Bovy `_. +- Combining datasets along N dimensions: + Datasets can now be combined along any number of dimensions, + instead of just a one-dimensional list of datasets. 
+ + The new ``manual_combine`` will accept the datasets as a nested + list-of-lists, and combine by applying a series of concat and merge + operations. + + ``open_mfdataset`` can use ``manual_combine`` to combine datasets along + multiple dimensions, by specifying `combine='manual'`. + + Some combinations of datasets will now throw FutureWarnings. To avoid these, + switch to using `manual_combine` (or `combine='manual'` in `open_mfdataset`). + (:issue:`2159`) By `Tom Nicholas `_. + Bug fixes ~~~~~~~~~
From bf7d549c02ac0fa7fb85b7f33a291accab19aa02 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 13:49:05 +0000 Subject: [PATCH 68/96] Reverted uamiv test --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8587b9c9aa2..442f791d54e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2894,7 +2894,7 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dim=['TSTEP'], + concat_dim='TSTEP', backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5)
From f00770f1b63f6b0e9c5570030fcc9fd338ef5997 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 17:38:17 +0000 Subject: [PATCH 69/96] Removed dependency on itertools --- xarray/core/combine.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3d97974d3be..625d235c9c4 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -643,14 +643,15 @@ def _requires_concat_and_merge(datasets): def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', data_vars='all', coords='different'): - from toolz import itertoolz if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), - datasets).values() - concatenated = [_auto_concat(ds, dim=dim, + + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped = itertools.groupby(sorted_datasets, key=vars_as_keys) + + concatenated = [_auto_concat(list(datasets), dim=dim, data_vars=data_vars, coords=coords) - for ds in grouped] + for vars, datasets in grouped] else: concatenated = datasets merged = merge(concatenated, compat=compat)
From c7c1746c1000853e44e7759116af6b8457d2567f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 17:38:31 +0000 Subject: [PATCH 70/96] Deprecation tests fixed --- xarray/tests/test_backends.py | 1 + 1 file changed, 1 insertion(+)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 442f791d54e..f9f9920c1b6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2524,6 +2524,7 @@ def test_save_mfdataset_compute_false_roundtrip(self): assert_identical(actual, original) +@requires_scipy_or_netCDF4 class TestOpenMFDataSetDeprecation: """ Set of tests to check that FutureWarnings are correctly raised until the
From f6192cad9d54b9967d6deaf5c303c33dc3668975 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 3 Mar 2019 18:03:04 +0000 Subject: [PATCH 71/96] Satisfy pycodestyle --- xarray/core/combine.py | 43 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 22 deletions(-)
diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 625d235c9c4..beb8c689e38 100644 --- a/xarray/core/combine.py +++
b/xarray/core/combine.py @@ -549,40 +549,39 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', concat_dim = _CONCAT_DIM_DEFAULT else: message = """In xarray version 0.13 `auto_combine` and `open_mfdataset` - will no longer accept a `concat_dim` argument. To get - equivalent behaviour from now on please use the new - `manual_combine` function instead (or the + will no longer accept a `concat_dim` argument. To get + equivalent behaviour from now on please use the new + `manual_combine` function instead (or the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) if _dimension_coords_exist(datasets): - message = """The datasets supplied have global dimension coordinates. + message = """The datasets supplied have global dimension coordinates. From xarray version 0.13 the behaviour of `auto_combine` - and `open_mfdataset` will - change to use the values in these coordinates to order the - datasets before concatenation. in future, to continue - concatenating based on the order the datasets are supplied - in, please use the new `manual_combine` function (or the - `combine='manual'` option to open_mfdataset).""" + and `open_mfdataset` will change to use the values in these + coordinates to order the datasets before concatenation. in + future, to continue concatenating based on the order the + datasets are supplied in, please use the new `manual_combine` + function (or the `combine='manual'` option to + open_mfdataset).""" warnings.warn(message, FutureWarning) else: - message = """The datasets supplied do not have global dimension - coordinates. From xarray version 0.13 the behaviour of - `auto_combine` and `open_mfdataset` will change to use the - values in these - coordinates to order the datasets before concatenation. - Datasets without global dimension coordinates will cease to - be valid arguments to `auto_combine`. In future, to continue - concatenating without supplying dimension coordinates, please - use the new `manual_combine` function (or the - `combine='manual'` option to open_mfdataset).""" + message = """The datasets supplied do not have global dimension + coordinates. From xarray version 0.13 the behaviour of + `auto_combine` and `open_mfdataset` will change to use the + values in these coordinates to order the datasets before + concatenation. Datasets without global dimension coordinates + will cease to be valid arguments to `auto_combine`. In + future, to continue concatenating without supplying dimension + coordinates, please use the new `manual_combine` function (or + the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) if _requires_concat_and_merge(datasets): manual_dims = [concat_dim].append(None) message = """The datasets supplied require both concatenation and merging. From xarray version 0.13 this will operation will - require using the new `manual_combine` function (or the + require using the new `manual_combine` function (or the `combine='manual'` option to open_mfdataset). 
You will need to create a nested list structure such that you can combine along the dimensions {}.""".format(manual_dims) @@ -608,7 +607,7 @@ def _dimension_coords_exist(datasets): concatenated_grouped_by_data_vars = [] try: for vars, datasets_with_same_vars in grouped_by_vars: - _infer_concat_order_from_coords(list(datasets_with_same_vars)) + _infer_concat_order_from_coords(list(datasets_with_same_vars)) return True except ValueError as err: no_dimension_coords_errs = ["Every dimension needs a coordinate", From 88f089e35bf7fa3e83ad574f420e016ae2bf7c81 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 09:54:49 +0000 Subject: [PATCH 72/96] Started deprecation cycle of auto_combine --- xarray/__init__.py | 2 +- xarray/backends/api.py | 40 +++++++++------ xarray/core/combine.py | 72 +++++++++++++-------------- xarray/tests/test_backends.py | 2 +- xarray/tests/test_combine.py | 93 ++++++++++++++++++----------------- 5 files changed, 110 insertions(+), 99 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index a20e2b16565..0a5633b1f2a 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -7,7 +7,7 @@ from .core.alignment import align, broadcast, broadcast_arrays from .core.common import full_like, zeros_like, ones_like from .core.concat import concat -from .core.combine import auto_combine, manual_combine +from .core.combine import combine_auto, combine_manual, auto_combine from .core.computation import apply_ufunc, dot, where from .core.extensions import (register_dataarray_accessor, register_dataset_accessor) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 93e35069df8..b6016c3154b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,8 @@ from .. import Dataset, DataArray, backends, conventions from ..core import indexing from .. import auto_combine -from ..core.combine import _manual_combine, _infer_concat_order_from_positions +from ..core.combine import (combine_auto, _manual_combine, + _infer_concat_order_from_positions) from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler @@ -507,14 +508,17 @@ def close(self): def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', - combine='auto', autoclose=None, parallel=False, **kwargs): + combine='_old_auto', autoclose=None, parallel=False, + **kwargs): """Open multiple files as a single dataset. - If combine='auto' then the function `auto_combine` is used to combine the + If combine='auto' then the function `combine_auto` is used to combine the datasets into one before returning the result, and if combine='manual' then - `manual_combine` is used. The filepaths must be structured according to + `combine_manual` is used. The filepaths must be structured according to which combining function is used, the details of which are given in the - documentation for ``auto_combine`` and ``manual_combine``. + documentation for ``combine_auto`` and ``combine_manual``. + By default the old (now deprecated) ``auto_combine`` will be used, please + specify either ``combine='auto'`` or ``combine='manual'`` in future. Requires dask to be installed. See documentation for details on dask [1]. Attributes from the first dataset file are used for the combined dataset. 
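# A minimal usage sketch of the two ``combine`` modes described in the
# docstring above. The file names and the 2-D nested layout are hypothetical,
# and this assumes the `combine='manual'` / `combine='auto'` keywords exactly
# as introduced by this patch series.
import xarray as xr

# combine='manual': the nested list-of-lists gives each file's position in the
# grid, with one entry in concat_dim per level of nesting.
ds = xr.open_mfdataset([['x0y0.nc', 'x0y1.nc'],
                        ['x1y0.nc', 'x1y1.nc']],
                       combine='manual', concat_dim=['x', 'y'])

# combine='auto': the order of the inputs is ignored and the datasets are
# arranged using their monotonic global dimension coordinates instead.
ds = xr.open_mfdataset(['part1.nc', 'part0.nc'], combine='auto')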
@@ -540,6 +544,10 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', if you want to stack a collection of 2D arrays along a third dimension. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. + combine : {'auto', 'manual'}, optional + Whether ``xarray.auto_combine`` or ``xarray.manual_combine`` is used to + combine all the data. Default is to use ``xarray.auto_combine``, but + this function has been deprecated.. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -594,9 +602,6 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. - combine : {'auto', 'manual'}, optional - Whether ``xarray.auto_combine`` or ``xarray.manual_combine`` is used to - combine all the data. Default is 'auto'. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. @@ -606,8 +611,9 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', See Also -------- + combine_auto + combine_manual auto_combine - manual_combine open_dataset References @@ -669,20 +675,26 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', # Combine all datasets, closing them in case of a ValueError try: - if combine is 'auto': + if combine is '_old_auto': # Use the old auto_combine for now - # After deprecation cycle from #2616 is complete this will redo - # ordering from coordinates, ignoring how they were ordered - # previously + # Remove this after deprecation cycle from #2616 is complete combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars, coords=coords) - else: + elif combine is 'manual': # Combined nested list by successive concat and merge operations # along each dimension, using structure given by "ids" combined = _manual_combine(datasets, concat_dims=concat_dim, compat=compat, data_vars=data_vars, coords=coords, ids=ids) + elif combine is 'auto': + # Redo ordering from coordinates, ignoring how they were ordered + # previously + combined = combine_auto(datasets, compat=compat, + data_vars=data_vars, coords=coords) + else: + raise ValueError("{} is an invalid option forthe keyword argument " + "``combine``".format(combine)) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index beb8c689e38..50770536b22 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -238,7 +238,7 @@ def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids): return combined -def manual_combine(datasets, concat_dim, compat='no_conflicts', +def combine_manual(datasets, concat_dim, compat='no_conflicts', data_vars='all', coords='different'): """ Explicitly combine an N-dimensional grid of datasets into one by using a @@ -312,7 +312,7 @@ def manual_combine(datasets, concat_dim, compat='no_conflicts', precipitation (x, y) float64 5.904 2.453 3.404 ... >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] - >>> combined = xr.manual_combine(ds_grid, concat_dim=['x', 'y']) + >>> combined = xr.combine_manual(ds_grid, concat_dim=['x', 'y']) Dimensions: (x: 4, y: 4) Dimensions without coordinates: x, y @@ -341,7 +341,7 @@ def manual_combine(datasets, concat_dim, compat='no_conflicts', precipitation (t) float64 5.904 2.453 3.404 ... 
>>> ds_grid = [[t1temp, t1precip], [t2temp, t2precip]] - >>> combined = xr.manual_combine(ds_grid, concat_dim=['t', None]) + >>> combined = xr.combine_manual(ds_grid, concat_dim=['t', None]) Dimensions: (t: 10) Dimensions without coordinates: t @@ -367,7 +367,7 @@ def vars_as_keys(ds): return tuple(sorted(ds)) -def _auto_combine(datasets, compat='no_conflicts', data_vars='all', +def combine_auto(datasets, compat='no_conflicts', data_vars='all', coords='different'): """ Attempt to auto-magically combine the given datasets into one by using @@ -423,7 +423,7 @@ def _auto_combine(datasets, compat='no_conflicts', data_vars='all', -------- concat merge - manual_combine + combine_manual Examples -------- @@ -448,7 +448,7 @@ def _auto_combine(datasets, compat='no_conflicts', data_vars='all', Data variables: temperature (x) float64 6.97 8.13 7.42 ... - >>> combined = xr.auto_combine([x2, x1]) + >>> combined = xr.combine_auto([x2, x1]) Dimensions: (x: 6) Coords: @@ -457,9 +457,6 @@ def _auto_combine(datasets, compat='no_conflicts', data_vars='all', temperature (x) float64 11.04 23.57 20.77 ... """ - # TODO to complete deprecation cycle in #2616 this should become the new - # auto_combine function (with this docstring) - # Group by data vars sorted_datasets = sorted(datasets, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) @@ -484,7 +481,8 @@ def _auto_combine(datasets, compat='no_conflicts', data_vars='all', return merge(concatenated_grouped_by_data_vars, compat=compat) -# Everything beyond here is only needed for backwards compatibility, see #2616 +# Everything beyond here is only needed until the deprecation cycle in #2616 +# is completed _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' @@ -548,43 +546,43 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', if concat_dim is '_not_supplied': concat_dim = _CONCAT_DIM_DEFAULT else: - message = """In xarray version 0.13 `auto_combine` and `open_mfdataset` - will no longer accept a `concat_dim` argument. To get - equivalent behaviour from now on please use the new - `manual_combine` function instead (or the + message = """In xarray version 0.13 `auto_combine` will be deprecated, + and `open_mfdataset` will no longer accept a `concat_dim` + argument. To get equivalent behaviour from now on please use + the new `combine_manual` function instead (or the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) if _dimension_coords_exist(datasets): - message = """The datasets supplied have global dimension coordinates. - From xarray version 0.13 the behaviour of `auto_combine` - and `open_mfdataset` will change to use the values in these - coordinates to order the datasets before concatenation. in - future, to continue concatenating based on the order the - datasets are supplied in, please use the new `manual_combine` - function (or the `combine='manual'` option to - open_mfdataset).""" + message = """In xarray version 0.13 `auto_combine` will be deprecated. + The datasets supplied have global dimension coordinates. + You may want to use the new `combine_auto` function (or the + `combine='auto'` option to `open_mfdataset` to order the + datasets before concatenation. 
Alternatively, to continue + concatenating based on the order the datasets are supplied in + in future, please use the new `combine_manual` function (or + the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) else: - message = """The datasets supplied do not have global dimension - coordinates. From xarray version 0.13 the behaviour of - `auto_combine` and `open_mfdataset` will change to use the - values in these coordinates to order the datasets before - concatenation. Datasets without global dimension coordinates - will cease to be valid arguments to `auto_combine`. In - future, to continue concatenating without supplying dimension - coordinates, please use the new `manual_combine` function (or - the `combine='manual'` option to open_mfdataset).""" + message = """In xarray version 0.13 `auto_combine` will be deprecated. + The datasets supplied do not have global dimension + coordinates. In future, to continue concatenating without + supplying dimension coordinates, please use the new + `combine_manual` function (or the `combine='manual'` option + to open_mfdataset).""" warnings.warn(message, FutureWarning) if _requires_concat_and_merge(datasets): manual_dims = [concat_dim].append(None) - message = """The datasets supplied require both concatenation and - merging. From xarray version 0.13 this will operation will - require using the new `manual_combine` function (or the - `combine='manual'` option to open_mfdataset). You will - need to create a nested list structure such that you can - combine along the dimensions {}.""".format(manual_dims) + message = """In xarray version 0.13 `auto_combine` will be deprecated. + The datasets supplied require both concatenation and merging. + From xarray version 0.13 this will operation will require + either using the new `manual_combine` function (or the + `combine='manual'` option to open_mfdataset), with + a nested list structure such that you can combine along the + dimensions {}. Alternatively if your datasets have global + dimension coordinates then you can use the new `combine_auto` + function.""".format(manual_dims) warnings.warn(message, FutureWarning) return _old_auto_combine(datasets, concat_dim=concat_dim, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f9f9920c1b6..5e3bcc996f9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2353,7 +2353,7 @@ def test_open_mfdataset_auto_combine(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp2, tmp1], combine='auto') as actual: assert_identical(original, actual) def test_preprocess_mfdataset(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 6c4fb3cb55c..026241917a7 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -5,11 +5,12 @@ import numpy as np import pytest -from xarray import DataArray, Dataset, concat, auto_combine, manual_combine +from xarray import DataArray, Dataset, concat, combine_auto, combine_manual +from xarray import auto_combine from xarray.core.combine import ( _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, _infer_concat_order_from_positions, - _infer_concat_order_from_coords, _auto_combine) + _infer_concat_order_from_coords) from . 
import (assert_combined_tile_ids_equal, assert_identical, assert_equal, raises_regex) @@ -311,29 +312,29 @@ class TestManualCombine: def test_manual_concat(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] expected = Dataset({'x': [0, 1]}) - actual = manual_combine(objs, concat_dim='x') + actual = combine_manual(objs, concat_dim='x') assert_identical(expected, actual) - actual = manual_combine(objs, concat_dim=['x']) + actual = combine_manual(objs, concat_dim=['x']) assert_identical(expected, actual) - actual = manual_combine([actual], concat_dim=None) + actual = combine_manual([actual], concat_dim=None) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = manual_combine(objs, concat_dim='x') + actual = combine_manual(objs, concat_dim='x') expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) # ensure manual_combine handles non-sorted variables objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] - actual = manual_combine(objs, concat_dim='a') + actual = combine_manual(objs, concat_dim='a') expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) assert_identical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with pytest.raises(KeyError): - manual_combine(objs, concat_dim='x') + combine_manual(objs, concat_dim='x') # TODO weird error from auto_concat on both of these when it tries to infer # dimension? @@ -341,51 +342,51 @@ def test_manual_concat(self): def test_manual_concat_too_many_dims_at_once(self): objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] with raises_regex(ValueError, 'too many .* dimensions'): - manual_combine(objs) + combine_manual(objs) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'cannot infer dimension'): - manual_combine(objs) + combine_manual(objs) def test_manual_concat_along_new_dim(self): objs = [Dataset({'a': ('x', [10]), 'x': [0]}), Dataset({'a': ('x', [20]), 'x': [0]})] expected = Dataset({'a': (('t', 'x'), [[10], [20]]), 'x': [0]}) - actual = manual_combine(objs, concat_dim='t') + actual = combine_manual(objs, concat_dim='t') assert_identical(expected, actual) # Same but with a DataArray as new dim, see GH #1988 and #2647 dim = DataArray([100, 150], name='baz', dims='baz') expected = Dataset({'a': (('baz', 'x'), [[10], [20]]), 'x': [0], 'baz': [100, 150]}) - actual = manual_combine(objs, concat_dim=dim) + actual = combine_manual(objs, concat_dim=dim) assert_identical(expected, actual) def test_manual_merge(self): data = Dataset({'x': 0}) - actual = manual_combine([data, data, data], concat_dim=None) + actual = combine_manual([data, data, data], concat_dim=None) assert_identical(data, actual) ds1 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) expected = Dataset({'a': ('x', [1, 2, 3]), 'x': [0, 1, 2]}) - actual = manual_combine([ds1, ds2], concat_dim=None) + actual = combine_manual([ds1, ds2], concat_dim=None) assert_identical(expected, actual) - actual = manual_combine([ds1, ds2], concat_dim=[None]) + actual = combine_manual([ds1, ds2], concat_dim=[None]) assert_identical(expected, actual) tmp1 = Dataset({'x': 0}) tmp2 = Dataset({'x': np.nan}) - actual = manual_combine([tmp1, tmp2], concat_dim=None) + actual = combine_manual([tmp1, tmp2], concat_dim=None) assert_identical(tmp1, actual) - actual = manual_combine([tmp1, tmp2], concat_dim=[None]) + actual = combine_manual([tmp1, tmp2], 
concat_dim=[None]) assert_identical(tmp1, actual) # Single object, with a concat_dim explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({'x': 0, 'y': 1})] dim = DataArray([100], name='baz', dims='baz') - actual = manual_combine(objs, concat_dim=[dim]) + actual = combine_manual(objs, concat_dim=[dim]) expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, {'baz': [100]}) assert_identical(expected, actual) @@ -394,7 +395,7 @@ def test_manual_merge(self): # expected for non-scalar values, too. objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] dim = DataArray([100], name='baz', dims='baz') - actual = manual_combine(objs, concat_dim=[dim]) + actual = combine_manual(objs, concat_dim=[dim]) expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), 'y': (('baz', 'z'), [[1, 2]])}, {'baz': [100]}) @@ -405,7 +406,7 @@ def test_concat_multiple_dims(self): Dataset({'a': (('x', 'y'), [[1]])})], [Dataset({'a': (('x', 'y'), [[2]])}), Dataset({'a': (('x', 'y'), [[3]])})]] - actual = manual_combine(objs, concat_dim=['x', 'y']) + actual = combine_manual(objs, concat_dim=['x', 'y']) expected = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}) assert_identical(expected, actual) @@ -417,9 +418,9 @@ def test_concat_name_symmetry(self): da3 = DataArray(name='a', data=[[2]], dims=['x', 'y']) da4 = DataArray(name='b', data=[[3]], dims=['x', 'y']) - x_first = manual_combine([[da1, da2], [da3, da4]], + x_first = combine_manual([[da1, da2], [da3, da4]], concat_dim=['x', 'y']) - y_first = manual_combine([[da1, da3], [da2, da4]], + y_first = combine_manual([[da1, da3], [da2, da4]], concat_dim=['y', 'x']) assert_identical(x_first, y_first) @@ -435,7 +436,7 @@ def test_concat_one_dim_merge_another(self): data2.var2.isel(dim2=slice(4, 9))]] expected = data[['var1', 'var2']] - actual = manual_combine(objs, concat_dim=[None, 'dim2']) + actual = combine_manual(objs, concat_dim=[None, 'dim2']) assert expected.identical(actual) def test_auto_combine_2d(self): @@ -447,7 +448,7 @@ def test_auto_combine_2d(self): expected = concat([partway1, partway2, partway3], dim='dim2') datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] - result = manual_combine(datasets, concat_dim=['dim1', 'dim2']) + result = combine_manual(datasets, concat_dim=['dim1', 'dim2']) assert_equal(result, expected) def test_manual_combine_missing_data_new_dim(self): @@ -458,7 +459,7 @@ def test_manual_combine_missing_data_new_dim(self): expected = Dataset({'a': (('t', 'x'), [[np.nan, 2, 3], [1, 2, np.nan]])}, {'x': [0, 1, 2]}) - actual = manual_combine(datasets, concat_dim='t') + actual = combine_manual(datasets, concat_dim='t') assert_identical(expected, actual) def test_invalid_hypercube_input(self): @@ -467,16 +468,16 @@ def test_invalid_hypercube_input(self): datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent lengths'): - manual_combine(datasets, concat_dim=['dim1', 'dim2']) + combine_manual(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent depths'): - manual_combine(datasets, concat_dim=['dim1', 'dim2']) + combine_manual(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [ds(3), ds(4)]] with raises_regex(ValueError, 'concat_dims has length'): - manual_combine(datasets, concat_dim=['dim1']) + combine_manual(datasets, concat_dim=['dim1']) def test_merge_one_dim_concat_another(self): objs = [[Dataset({'foo': ('x', [0, 1])}), @@ -486,7 +487,7 @@ def 
test_merge_one_dim_concat_another(self): expected = Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40])}) - actual = manual_combine(objs, concat_dim=['x', None], compat='equals') + actual = combine_manual(objs, concat_dim=['x', None], compat='equals') assert_identical(expected, actual) # Proving it works symmetrically @@ -494,22 +495,22 @@ def test_merge_one_dim_concat_another(self): Dataset({'foo': ('x', [2, 3])})], [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]] - actual = manual_combine(objs, concat_dim=[None, 'x'], compat='equals') + actual = combine_manual(objs, concat_dim=[None, 'x'], compat='equals') assert_identical(expected, actual) def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] - actual = manual_combine(objs, concat_dim=[None, 'x']) + actual = combine_manual(objs, concat_dim=[None, 'x']) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]] - actual = manual_combine(objs, concat_dim=['x', None]) + actual = combine_manual(objs, concat_dim=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})]] - actual = manual_combine(objs, concat_dim=[None, None]) + actual = combine_manual(objs, concat_dim=[None, None]) expected = Dataset({'x': [0]}) assert_identical(expected, actual) @@ -521,44 +522,44 @@ class TestAutoCombine: """ def test_auto_combine(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - actual = _auto_combine([actual]) + actual = combine_auto([actual]) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) # ensure auto_combine handles non-sorted variables objs = [Dataset({'x': ('a', [0]), 'y': ('a', [0]), 'a': [0]}), Dataset({'x': ('a', [1]), 'y': ('a', [1]), 'a': [1]})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 1]}) assert_identical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': [0, 1], 'y': [0, 1]}) assert_equal(actual, expected) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'Could not find any dimension ' 'coordinates'): - _auto_combine(objs) + combine_auto(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with raises_regex(ValueError, 'Every dimension needs a coordinate'): - _auto_combine(objs) + combine_auto(objs) def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = data assert expected.broadcast_equals(actual) @@ -570,7 +571,7 @@ def test_auto_combine_previously_failed(self): Dataset({'a': ('x', [1]), 'x': [1]})] expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, {'x': [0, 1]}) - actual = _auto_combine(datasets) + actual = combine_auto(datasets) assert_identical(expected, actual) def test_auto_combine_still_fails(self): @@ -579,16 +580,16 @@ def test_auto_combine_still_fails(self): datasets = [Dataset({'x': 0}, {'y': 0}), Dataset({'x': 
1}, {'y': 1, 'z': 1})] with pytest.raises(ValueError): - _auto_combine(datasets, 'y') + combine_auto(datasets, 'y') def test_auto_combine_no_concat(self): objs = [Dataset({'x': 0}), Dataset({'y': 1})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': 0, 'y': 1}) assert_identical(expected, actual) objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'x': 0, 'y': 1, 'z': 2}) assert_identical(expected, actual) @@ -601,7 +602,7 @@ def test_internal_ordering(self): Dataset({'bar': ('x', [10, 20]), 'x': [10, 20]}), Dataset({'foo': ('x', [2, 3]), 'x': [2, 3]}), Dataset({'bar': ('x', [30, 40]), 'x': [30, 40]})] - actual = _auto_combine(objs) + actual = combine_auto(objs) expected = Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40]), 'x': [0, 1, 10, 20, 30, 40]}) From 535bc3186c2c65ae9b63fb847e293eb6e718bc2c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 11:21:36 +0000 Subject: [PATCH 73/96] Added specific error for edge case combine_manual can't handle --- xarray/core/combine.py | 15 +++++++++++++-- xarray/tests/test_combine.py | 5 +++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 50770536b22..db0dbea5a6c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -204,8 +204,19 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all', """ if concat_dim is not None: - combined = concat(datasets, dim=concat_dim, data_vars=data_vars, - coords=coords) + try: + combined = concat(datasets, dim=concat_dim, data_vars=data_vars, + coords=coords) + except ValueError as err: + if "encountered unexpected variable" in str(err): + raise ValueError("These objects cannot be combined along the " + "dimension {concat_dim} using only " + "xarray.concat, you must use " + "xarray.combine_auto instead, as this can " + "handle combining operations requiring both " + "concat and merge along the same dimension.") + else: + raise else: combined = merge(datasets, compat=compat) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 026241917a7..7b062d179fc 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -514,6 +514,11 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({'x': [0]}) assert_identical(expected, actual) + def test_manual_combine_but_need_auto_combine(self): + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2], 'wall': [0]})] + with raises_regex(ValueError, 'cannot be combined'): + combine_manual(objs, concat_dim='x') + class TestAutoCombine: """ From 5d818e0f6fc3810868a4066a43fc6ff6b4f70574 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 12:48:03 +0000 Subject: [PATCH 74/96] Check that global coordinates are monotonic --- xarray/core/combine.py | 13 +++++++++---- xarray/tests/test_combine.py | 4 ---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index db0dbea5a6c..a7af5835087 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -96,8 +96,6 @@ def _infer_concat_order_from_coords(datasets): rank = series.rank(method='dense', ascending=ascending) order = rank.astype(int).values - 1 - # TODO check that resulting global coordinate is monotonic - # Append positions along extra dimension to structure which # encodes the multi-dimensional concatentation order tile_ids = 
[tile_id + (position,) for tile_id, position @@ -479,14 +477,21 @@ def combine_auto(datasets, compat='no_conflicts', data_vars='all', combined_ids, concat_dims = _infer_concat_order_from_coords( list(datasets_with_same_vars)) - # TODO checking the shape of the combined ids appropriate here? _check_shape_tile_ids(combined_ids) # Concatenate along all of concat_dims one by one to create single ds concatenated = _combine_nd(combined_ids, concat_dims=concat_dims, data_vars=data_vars, coords=coords) - # TODO check the overall coordinates are monotonically increasing? + # Check the overall coordinates are monotonically increasing + for dim in concatenated.dims: + if dim in concatenated: + indexes = concatenated.indexes.get(dim) + if not (indexes.is_monotonic_increasing + or indexes.is_monotonic_decreasing): + raise ValueError("Resulting object does not have monotonic" + "global indexes along dimension {}" + .format(dim)) concatenated_grouped_by_data_vars.append(concatenated) return merge(concatenated_grouped_by_data_vars, compat=compat) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 7b062d179fc..805e7f3d9a5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -521,10 +521,6 @@ def test_manual_combine_but_need_auto_combine(self): class TestAutoCombine: - """ - When completing the deprecation cycle from #2616 change all - _auto_combine to auto_combine. - """ def test_auto_combine(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] actual = combine_auto(objs) From 42cd05d8abc19fb2cb56aeb1aff223180ac0e2d2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 13:05:25 +0000 Subject: [PATCH 75/96] Highlighted weird behaviour when concatenating with no data variables --- xarray/tests/test_combine.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 805e7f3d9a5..221cab42846 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -336,17 +336,12 @@ def test_manual_concat(self): with pytest.raises(KeyError): combine_manual(objs, concat_dim='x') - # TODO weird error from auto_concat on both of these when it tries to infer - # dimension? + # TODO confused because this should not join up along 'y'?? 
@pytest.mark.xfail def test_manual_concat_too_many_dims_at_once(self): - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - with raises_regex(ValueError, 'too many .* dimensions'): - combine_manual(objs) - - objs = [Dataset({'x': 0}), Dataset({'x': 1})] - with raises_regex(ValueError, 'cannot infer dimension'): - combine_manual(objs) + objs = [Dataset({'x': [0], 'y': [1]}), Dataset({'y': [0], 'x': [1]})] + with pytest.raises(ValueError, "require both concatenation"): + result = combine_manual(objs, concat_dim='x') def test_manual_concat_along_new_dim(self): objs = [Dataset({'a': ('x', [10]), 'x': [0]}), From 8a838143cbf52a890a9775b5be61aa971f9567e9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 13:10:30 +0000 Subject: [PATCH 76/96] Added test for impossible-to-auto-combine coordinates --- xarray/core/combine.py | 4 +++- xarray/tests/test_combine.py | 17 +++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a7af5835087..434374f2f9f 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -70,6 +70,8 @@ def _infer_concat_order_from_coords(datasets): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) + print(indexes) + if all(index.is_monotonic_increasing for index in indexes): ascending = True elif all(index.is_monotonic_decreasing for index in indexes): @@ -490,7 +492,7 @@ def combine_auto(datasets, compat='no_conflicts', data_vars='all', if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing): raise ValueError("Resulting object does not have monotonic" - "global indexes along dimension {}" + " global indexes along dimension {}" .format(dim)) concatenated_grouped_by_data_vars.append(concatenated) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 221cab42846..c5f11387c24 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -150,16 +150,6 @@ def test_coord_monotonically_decreasing(self): assert_combined_tile_ids_equal(expected, actual) assert concat_dims == ['x'] - # TODO implement this error message - @pytest.mark.xfail - def test_check_for_impossible_ordering(self): - ds0 = Dataset({'x': [0, 1, 5]}) - ds1 = Dataset({'x': [2, 3]}) - with raises_regex(ValueError, "Unable to arrange datasets such that " - "coordinate values along dimension x are" - " monotonically increasing"): - _infer_concat_order_from_coords([ds1, ds0]) - def test_no_concatenation_needed(self): ds = Dataset({'foo': ('x', [0, 1])}) expected = {(): ds} @@ -604,6 +594,13 @@ def test_internal_ordering(self): 'x': [0, 1, 10, 20, 30, 40]}) assert_identical(expected, actual) + def test_check_for_impossible_ordering(self): + ds0 = Dataset({'x': [0, 1, 5]}) + ds1 = Dataset({'x': [2, 3]}) + with raises_regex(ValueError, "does not have monotonic global indexes" + " along dimension x"): + combine_auto([ds1, ds0]) + class TestAutoCombineOldAPI: """ From e4acbdc7790c625b0bdbed5a5e97ff88691b324a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 13:43:35 +0000 Subject: [PATCH 77/96] Removed uneeded test --- xarray/tests/test_combine.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index c5f11387c24..885f1a9e22c 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -579,21 +579,6 @@ def test_auto_combine_no_concat(self): expected = Dataset({'x': 0, 'y': 1, 'z': 2}) 
assert_identical(expected, actual) - # TODO decide if this test should be rewritten - @pytest.mark.xfail - def test_internal_ordering(self): - # This gives a MergeError if _auto_combine_1d is not sorting by - # data_vars correctly, see GH #2662 - objs = [Dataset({'foo': ('x', [0, 1]), 'x': [0, 1]}), - Dataset({'bar': ('x', [10, 20]), 'x': [10, 20]}), - Dataset({'foo': ('x', [2, 3]), 'x': [2, 3]}), - Dataset({'bar': ('x', [30, 40]), 'x': [30, 40]})] - actual = combine_auto(objs) - expected = Dataset({'foo': ('x', [0, 1, 2, 3]), - 'bar': ('x', [10, 20, 30, 40]), - 'x': [0, 1, 10, 20, 30, 40]}) - assert_identical(expected, actual) - def test_check_for_impossible_ordering(self): ds0 = Dataset({'x': [0, 1, 5]}) ds1 = Dataset({'x': [2, 3]}) From 8e767e2220b670b2ef46da8acd67f0979f66282c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 14:04:53 +0000 Subject: [PATCH 78/96] Satisfy linter --- xarray/backends/api.py | 2 +- xarray/core/combine.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4177112ef50..8797cf9bb24 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -11,7 +11,7 @@ from ..core import indexing from .. import auto_combine from ..core.combine import (combine_auto, _manual_combine, - _infer_concat_order_from_positions) + _infer_concat_order_from_positions) from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 434374f2f9f..a05703e51ff 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -379,7 +379,7 @@ def vars_as_keys(ds): def combine_auto(datasets, compat='no_conflicts', data_vars='all', - coords='different'): + coords='different'): """ Attempt to auto-magically combine the given datasets into one by using dimension coordinates. @@ -565,9 +565,9 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', concat_dim = _CONCAT_DIM_DEFAULT else: message = """In xarray version 0.13 `auto_combine` will be deprecated, - and `open_mfdataset` will no longer accept a `concat_dim` - argument. To get equivalent behaviour from now on please use - the new `combine_manual` function instead (or the + and `open_mfdataset` will no longer accept a `concat_dim` + argument. To get equivalent behaviour from now on please use + the new `combine_manual` function instead (or the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) @@ -575,18 +575,18 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', message = """In xarray version 0.13 `auto_combine` will be deprecated. The datasets supplied have global dimension coordinates. You may want to use the new `combine_auto` function (or the - `combine='auto'` option to `open_mfdataset` to order the - datasets before concatenation. Alternatively, to continue + `combine='auto'` option to `open_mfdataset` to order the + datasets before concatenation. Alternatively, to continue concatenating based on the order the datasets are supplied in - in future, please use the new `combine_manual` function (or + in future, please use the new `combine_manual` function (or the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) else: message = """In xarray version 0.13 `auto_combine` will be deprecated. 
The datasets supplied do not have global dimension - coordinates. In future, to continue concatenating without - supplying dimension coordinates, please use the new - `combine_manual` function (or the `combine='manual'` option + coordinates. In future, to continue concatenating without + supplying dimension coordinates, please use the new + `combine_manual` function (or the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) @@ -594,11 +594,11 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', manual_dims = [concat_dim].append(None) message = """In xarray version 0.13 `auto_combine` will be deprecated. The datasets supplied require both concatenation and merging. - From xarray version 0.13 this will operation will require + From xarray version 0.13 this will operation will require either using the new `manual_combine` function (or the `combine='manual'` option to open_mfdataset), with - a nested list structure such that you can combine along the - dimensions {}. Alternatively if your datasets have global + a nested list structure such that you can combine along the + dimensions {}. Alternatively if your datasets have global dimension coordinates then you can use the new `combine_auto` function.""".format(manual_dims) warnings.warn(message, FutureWarning) From 3d0411286f8cea4ae4be0191920c1e75ba7fee27 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 15:28:40 +0000 Subject: [PATCH 79/96] Added airspeedvelocity benchmark for combining functions --- asv_bench/benchmarks/combine.py | 37 +++++++++++++++++++++++++++++++++ xarray/core/combine.py | 2 -- 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 asv_bench/benchmarks/combine.py diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py new file mode 100644 index 00000000000..2374b3cd2b4 --- /dev/null +++ b/asv_bench/benchmarks/combine.py @@ -0,0 +1,37 @@ +import numpy as np +import xarray as xr + + +class Combine: + """Benchmark concatenating and merging large datasets""" + + def setup(self): + """Create 4 datasets with two different variables""" + + t_size, x_size, y_size = 10, 90, 80 + t, x, y = np.arange(t_size), np.arange(x_size), np.arange(y_size) + data = np.random.randn(t_size, x_size, y_size) + + self.dsA0 = xr.Dataset( + {'A': xr.DataArray(data, coords={'T': t}, + dims=('T', 'X', 'Y'))}) + self.dsA1 = xr.Dataset( + {'A': xr.DataArray(data, coords={'T': t + t_size}, + dims=('T', 'X', 'Y'))}) + self.dsB0 = xr.Dataset( + {'B': xr.DataArray(data, coords={'T': t}, + dims=('T', 'X', 'Y'))}) + self.dsB1 = xr.Dataset( + {'B': xr.DataArray(data, coords={'T': t + t_size}, + dims=('T', 'X', 'Y'))}) + + def time_combine_manual(self): + datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] + + xr.combine_manual(datasets, concat_dim=[None, 't']) + + def time_auto_combine(self): + """Also has to load and arrange t coordinate""" + datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] + + xr.combine_auto(datasets) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a05703e51ff..c69139d955a 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -70,8 +70,6 @@ def _infer_concat_order_from_coords(datasets): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) - print(indexes) - if all(index.is_monotonic_increasing for index in indexes): ascending = True elif all(index.is_monotonic_decreasing for index in indexes): From 06ecef63bcab619d7d0b68687a7e787b7f536eb9 Mon 
Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 15:54:58 +0000 Subject: [PATCH 80/96] Benchmark will take longer now --- asv_bench/benchmarks/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 2374b3cd2b4..6ce55b17314 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -8,7 +8,7 @@ class Combine: def setup(self): """Create 4 datasets with two different variables""" - t_size, x_size, y_size = 10, 90, 80 + t_size, x_size, y_size = 100, 900, 800 t, x, y = np.arange(t_size), np.arange(x_size), np.arange(y_size) data = np.random.randn(t_size, x_size, y_size) From 513764fa4094acda67aec5fea8397bc5726afb90 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 18 Mar 2019 15:55:29 +0000 Subject: [PATCH 81/96] Updated version numbers in deprecation warnings to fit with recent release of 0.12 --- xarray/core/combine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c69139d955a..a7356941ffd 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -562,7 +562,7 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', if concat_dim is '_not_supplied': concat_dim = _CONCAT_DIM_DEFAULT else: - message = """In xarray version 0.13 `auto_combine` will be deprecated, + message = """In xarray version 0.14 `auto_combine` will be deprecated, and `open_mfdataset` will no longer accept a `concat_dim` argument. To get equivalent behaviour from now on please use the new `combine_manual` function instead (or the @@ -570,7 +570,7 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', warnings.warn(message, FutureWarning) if _dimension_coords_exist(datasets): - message = """In xarray version 0.13 `auto_combine` will be deprecated. + message = """In xarray version 0.14 `auto_combine` will be deprecated. The datasets supplied have global dimension coordinates. You may want to use the new `combine_auto` function (or the `combine='auto'` option to `open_mfdataset` to order the @@ -580,7 +580,7 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', the `combine='manual'` option to open_mfdataset).""" warnings.warn(message, FutureWarning) else: - message = """In xarray version 0.13 `auto_combine` will be deprecated. + message = """In xarray version 0.14 `auto_combine` will be deprecated. The datasets supplied do not have global dimension coordinates. In future, to continue concatenating without supplying dimension coordinates, please use the new @@ -590,9 +590,9 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', if _requires_concat_and_merge(datasets): manual_dims = [concat_dim].append(None) - message = """In xarray version 0.13 `auto_combine` will be deprecated. + message = """In xarray version 0.14 `auto_combine` will be deprecated. The datasets supplied require both concatenation and merging. 
- From xarray version 0.13 this will operation will require + From xarray version 0.14 this will operation will require either using the new `manual_combine` function (or the `combine='manual'` option to open_mfdataset), with a nested list structure such that you can combine along the From 13364ff9c03a483627c0397f463422efb040d7d0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 18 May 2019 18:49:05 +0100 Subject: [PATCH 82/96] Updated api docs for new function names --- doc/api.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index 960ac7f08b4..7f9b377760e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -20,7 +20,8 @@ Top-level functions concat merge auto_combine - manual_combine + combine_auto + combine_manual where set_options full_like From ddfc6dd1aa2d3164e1e2e7f090dac95e8789cc1c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 18 May 2019 18:49:23 +0100 Subject: [PATCH 83/96] Fixed docs build failure --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 12c5d139fdc..e6dfaa90012 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1381,7 +1381,7 @@ def info(self, buf=None): See Also -------- pandas.DataFrame.assign - netCDF's ncdump + ncdump : netCDF's ncdump command line utility """ if buf is None: # pragma: no cover From e471a423cf418a092cdacf8137f4d56ea5c30956 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 19 May 2019 18:17:18 +0100 Subject: [PATCH 84/96] Revert "Fixed docs build failure" This reverts commit ddfc6dd1aa2d3164e1e2e7f090dac95e8789cc1c. --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e6dfaa90012..12c5d139fdc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1381,7 +1381,7 @@ def info(self, buf=None): See Also -------- pandas.DataFrame.assign - ncdump : netCDF's ncdump command line utility + netCDF's ncdump """ if buf is None: # pragma: no cover From 2d5b90f14444f086d98e393d20e733856157052e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 19 May 2019 22:47:43 +0100 Subject: [PATCH 85/96] Updated documentation with section explaining new functions --- doc/combining.rst | 66 ++++++++++++++++++++++++++++++++++++++++-- doc/io.rst | 8 +++-- xarray/core/combine.py | 5 +++- 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/doc/combining.rst b/doc/combining.rst index 388cc2ba5f3..e2afb2d12b1 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -11,9 +11,10 @@ Combining data import xarray as xr np.random.seed(123456) -* For combining datasets or data arrays along a dimension, see concatenate_. +* For combining datasets or data arrays along a single dimension, see concatenate_. * For combining datasets with different variables, see merge_. * For combining datasets or data arrays with different indexes or missing values, see combine_. +* For combining datasets or data arrays along multiple dimensions see combining.multi_. .. _concatenate: @@ -77,7 +78,7 @@ Merge ~~~~~ To combine variables and coordinates between multiple ``DataArray`` and/or -``Dataset`` object, use :py:func:`~xarray.merge`. It can merge a list of +``Dataset`` objects, use :py:func:`~xarray.merge`. 
It can merge a list of ``Dataset``, ``DataArray`` or dictionaries of objects convertible to ``DataArray`` objects: @@ -237,3 +238,64 @@ coordinates as long as any non-missing values agree or are disjoint: Note that due to the underlying representation of missing values as floating point numbers (``NaN``), variable data type is not always preserved when merging in this manner. + +.. _combining.multi: + +Combining along multiple dimensions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For combining many objects along multiple dimensions xarray provides +``combine_manual`` and ``combine_auto``. These functions use a combination of +``concat`` and ``merge`` across different variables to combine many objects +into one. + +``combine_manual`` requires specifying the order in which the objects should be +combined, while ``combine_auto`` attempts to infer this ordering automatically +from the coordinates in the data. + +``manual_combine`` is useful when you know the spatial relationship between +each object in advance. A common task is collecting data from a parallelized +simulation where each processor wrote out data to a separate file. A domain +which was decomposed into 4 parts, 2 each along both the x and y axes, requires +organising the datasets into a doubly-nested list, e.g: + +.. ipython:: python + + arr = xr.DataArray(name='temperature', data=np.random.randint(5, size=(2, 2)), dims=['x', 'y']) + arr + ds_grid = [[arr, arr], [arr, arr]] + xr.combine_manual(ds_grid, concat_dim=['x', 'y']) + +``manual_combine`` can also be used to explicitly merge datasets with +different variables. For example if we have 4 datasets, which are divided +along two times, and contain two different variables, we can pass ``None`` +to ``'concat_dim'`` to specify the dimension of the nested list over which +we wish to use ``merge`` instead of ``concat``: + +.. ipython:: python + + temp = xr.DataArray(name='temperature', data=np.random.randn(2), dims=['t']) + precip = xr.DataArray(name='precipitation', data=np.random.randn(2), dims=['t']) + ds_grid = [[temp, precip], [temp, precip]] + xr.combine_manual(ds_grid, concat_dim=['t', None]) + +``combine_auto`` is for combining objects which have dimension coordinates +which specify their relationship to and order relative to one another, for +example a linearly-increasing 'time' dimension coordinate. + +Here we combine two datasets using their common dimension coordinates. Notice +they are concatenated in order based on the values in their dimension +coordinates, not on their position in the list passed to ``combine_auto``. + +.. ipython:: python + :okwarning: + + x1 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [0, 1, 2])]) + x2 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [3, 4, 5])]) + xr.combine_auto([x2, x1]) + +These functions can be used by ``open_mfdataset`` to open many files as one +dataset. The particular function used is specified by setting the argument +``'combine'`` to ``'auto'`` or ``'manual'``. This is useful for situations +where your data is split across many files in multiple locations, which have +some known relationship between one another. \ No newline at end of file diff --git a/doc/io.rst b/doc/io.rst index 51c747189da..983ae0ea679 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -759,7 +759,10 @@ Combining multiple files NetCDF files are often encountered in collections, e.g., with different files corresponding to different model runs. 
xarray can straightforwardly combine such -files into a single Dataset by making use of :py:func:`~xarray.concat`. +files into a single Dataset by making use of :py:func:`~xarray.concat`, +:py:func:`~xarray.merge`, :py:func:`~xarray.combine_manual` and +:py:func:`~xarray.combine_auto`. For details on the difference between these +functions see :ref:`combining data`. .. note:: @@ -772,7 +775,8 @@ files into a single Dataset by making use of :py:func:`~xarray.concat`. This function automatically concatenates and merges multiple files into a single xarray dataset. It is the recommended way to open multiple files with xarray. - For more details, see :ref:`dask.io` and a `blog post`_ by Stephan Hoyer. + For more details, see :ref:`combining.multi`, :ref:`dask.io` and a + `blog post`_ by Stephan Hoyer. .. _dask: http://dask.pydata.org .. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a7356941ffd..63a8589fea0 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -439,7 +439,7 @@ def combine_auto(datasets, compat='no_conflicts', data_vars='all', Combining two datasets using their common dimension coordinates. Notice they are concatenated based on the values in their dimension coordinates, - not on their position in the list passed to `auto_combine`. + not on their position in the list passed to `combine_auto`. >>> x1 @@ -519,6 +519,9 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', variables, and each combination of a distinct time period and set of data variables is saved its own dataset. + This entire function is in the process of being deprecated in favour of + ``combine_manual`` and ``combine_auto``. + Parameters ---------- datasets : sequence of xarray.Dataset From 9ead34edd9e62261500c0f33967f55b186bba531 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 20 May 2019 10:24:50 +0100 Subject: [PATCH 86/96] Suppressed deprecation warnings in test suite --- xarray/tests/test_backends.py | 6 ++++++ xarray/tests/test_combine.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 28f5ddf75e4..9aa7abc9880 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2163,6 +2163,8 @@ def skip_if_not_engine(engine): pytest.importorskip(engine) +@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " + "will be deprecated") def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, file_cache_maxsize): @@ -2196,6 +2198,8 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, @requires_scipy_or_netCDF4 +@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " + "will be deprecated") class TestOpenMFDatasetWithDataVarsAndCoordsKw: coord_name = 'lon' var_name = 'v1' @@ -2301,6 +2305,8 @@ def test_invalid_data_vars_value_should_fail(self): @requires_dask @requires_scipy @requires_netCDF4 +@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " + "will be deprecated") class TestDask(DatasetIOBase): @contextlib.contextmanager def create_store(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0c217b46a71..5073fbae13a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -593,6 +593,8 @@ def test_check_for_impossible_ordering(self): combine_auto([ds1, ds0]) +@pytest.mark.filterwarnings("ignore:In xarray version 0.14 
`auto_combine` " + "will be deprecated") class TestAutoCombineOldAPI: """ Set of tests which check that old 1-dimensional auto_combine behaviour is From fab3586e01215ff4dad56f6e030552853c257b28 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 20 May 2019 12:53:45 +0100 Subject: [PATCH 87/96] Resolved ToDo by pointing to issue with concat, see #2975 --- xarray/tests/test_combine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 5073fbae13a..d2aab62f9f3 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -332,12 +332,12 @@ def test_manual_concat(self): with pytest.raises(KeyError): combine_manual(objs, concat_dim='x') - # TODO confused because this should not join up along 'y'?? + # Fails because of concat's weird treatment of dimension coords, see #2975 @pytest.mark.xfail def test_manual_concat_too_many_dims_at_once(self): objs = [Dataset({'x': [0], 'y': [1]}), Dataset({'y': [0], 'x': [1]})] - with pytest.raises(ValueError, "require both concatenation"): - result = combine_manual(objs, concat_dim='x') + with pytest.raises(ValueError, "not equal across datasets"): + combine_manual(objs, concat_dim='x', coords='minimal') def test_manual_concat_along_new_dim(self): objs = [Dataset({'a': ('x', [10]), 'x': [0]}), From 9d5e29f0bf2562a3fd2096c577129ec9fb208b9e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 20 May 2019 19:06:29 +0100 Subject: [PATCH 88/96] Various docs fixes --- doc/combining.rst | 23 +++++++++++++++++------ doc/whats-new.rst | 32 +++++++++++++++++--------------- xarray/core/combine.py | 16 ++++++++-------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/doc/combining.rst b/doc/combining.rst index e2afb2d12b1..b7a55efcf0d 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -244,6 +244,15 @@ in this manner. Combining along multiple dimensions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + + There are currently three combining functions with similar names: + ``auto_combine``, ``combine_auto``, and ``combine_manual``. This is because + ``auto_combine`` is in the process of being deprecated in favour of the other + two functions, which are more general. If your code currently relies on + ``auto_combine``, then you will be able to get similar functionality by using + ``combine_manual``. + For combining many objects along multiple dimensions xarray provides ``combine_manual`` and ``combine_auto``. These functions use a combination of ``concat`` and ``merge`` across different variables to combine many objects @@ -253,11 +262,13 @@ into one. combined, while ``combine_auto`` attempts to infer this ordering automatically from the coordinates in the data. -``manual_combine`` is useful when you know the spatial relationship between -each object in advance. A common task is collecting data from a parallelized -simulation where each processor wrote out data to a separate file. A domain -which was decomposed into 4 parts, 2 each along both the x and y axes, requires -organising the datasets into a doubly-nested list, e.g: +``combine_manual`` is useful when you know the spatial relationship between +each object in advance. The datasets must be provided in form of a nested list, +which specifies their relative position and ordering. A common task is +collecting data from a parallelized simulation where each processor wrote out +data to a separate file. 
A domain which was decomposed into 4 parts, 2 each
+along both the x and y axes, requires organising the datasets into a
+doubly-nested list, e.g:
 
 .. ipython:: python
@@ -266,7 +277,7 @@ organising the datasets into a doubly-nested list, e.g:
     ds_grid = [[arr, arr], [arr, arr]]
     xr.combine_manual(ds_grid, concat_dim=['x', 'y'])
 
-``manual_combine`` can also be used to explicitly merge datasets with
+``combine_manual`` can also be used to explicitly merge datasets with
 different variables. For example if we have 4 datasets, which are divided
 along two times, and contain two different variables, we can pass ``None``
 to ``'concat_dim'`` to specify the dimension of the nested list over which
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 7b97d6b2608..ab5388ef031 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -35,6 +35,23 @@ Enhancements
    helpful for avoiding file-lock errors when trying to write to files opened
    using ``open_dataset()`` or ``open_dataarray()``. (:issue:`2887`)
    By `Dan Nowacki `_.
+- Combining datasets along N dimensions:
+  Datasets can now be combined along any number of dimensions,
+  instead of just a one-dimensional list of datasets.
+
+  The new ``combine_manual`` will accept the datasets as a nested
+  list-of-lists, and combine by applying a series of concat and merge
+  operations. The new ``combine_auto`` will instead use the dimension
+  coordinates of the datasets to order them.
+
+  ``open_mfdataset`` can use either ``combine_manual`` or ``combine_auto`` to
+  combine datasets along multiple dimensions, by specifying the argument
+  `combine='manual'` or `combine='auto'`.
+
+  This means that the original function ``auto_combine`` is being deprecated.
+  To avoid FutureWarnings switch to using `combine_manual` or `combine_auto`,
+  (or set the `combine` argument in `open_mfdataset`). (:issue:`2159`)
+  By `Tom Nicholas `_.
 
 Bug fixes
 ~~~~~~~~~
@@ -149,21 +166,6 @@ Other enhancements
   report showing what exactly differs between the two objects (dimensions /
   coordinates / variables / attributes) (:issue:`1507`).
   By `Benoit Bovy `_.
-- Combining datasets along N dimensions:
-  Datasets can now be combined along any number of dimensions,
-  instead of just a one-dimensional list of datasets.
-
-  The new ``manual_combine`` will accept the datasets as a a nested
-  list-of-lists, and combine by applying a series of concat and merge
-  operations.
-
-  ``open_mfdataset`` can use ``manual_combine`` to combine datasets along
-  multiple dimensions, by specifying `combine='manual'`.
-
-  Some combinations of datasets will now throw FutureWarnings. To avoid these
-  switch to using `manual_combine` (or `combine='manual'` in `open_mfdataset`).
-  (:issue:`2159`) By `Tom Nicholas `_.
-
 - Resampling of standard and non-standard calendars indexed by
   :py:class:`~xarray.CFTimeIndex` is now possible. (:issue:`2191`).
   By `Jwen Fai Low `_ and
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 63a8589fea0..6a3d9069993 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -253,11 +253,11 @@ def combine_manual(datasets, concat_dim, compat='no_conflicts',
     Explicitly combine an N-dimensional grid of datasets into
     one by using a succession of concat and merge operations along each
     dimension of the grid.
 
-    Does not sort data under any circumstances, so the datasets must be passed
-    in the order you wish them to be concatenated. It does align coordinates,
-    but different variables on datasets can cause it to fail under some
-    scenarios. In complex cases, you may need to clean up your data and use
-    concat/merge explicitly.
+    Does not sort the supplied datasets under any circumstances, so the
+    datasets must be passed in the order you wish them to be concatenated. It
+    does align coordinates, but different variables on datasets can cause it to
+    fail under some scenarios.
In complex cases, you may need to clean up your data and use - concat/merge explicitly. + Does not sort the supplied datasets under any circumstances, so the + datasets must be passed in the order you wish them to be concatenated. It + does align coordinates, but different variables on datasets can cause it to + fail under some scenarios. In complex cases, you may need to clean up your + data and use concat/merge explicitly. To concatenate along multiple dimensions the datasets must be passed as a nested list-of-lists, with a depth equal to the length of ``concat_dims``. @@ -509,6 +509,9 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', """ Attempt to auto-magically combine the given datasets into one. + This entire function is in the process of being deprecated in favour of + ``combine_manual`` and ``combine_auto``. + This method attempts to combine a list of datasets into a single entity by inspecting metadata and using a combination of concat and merge. It does not concatenate along more than one dimension or sort data under @@ -519,9 +522,6 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', variables, and each combination of a distinct time period and set of data variables is saved its own dataset. - This entire function is in the process of being deprecated in favour of - ``combine_manual`` and ``combine_auto``. - Parameters ---------- datasets : sequence of xarray.Dataset From ae7b8110f0c7ad7f874cb14e6ba3571e50a0a840 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 28 May 2019 10:29:05 +0100 Subject: [PATCH 89/96] Slightly renamed tests to match new name of tested function --- xarray/tests/test_combine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 990a0e4856d..60b98d6e676 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -527,8 +527,8 @@ def test_combine_manual_fill_value(self, fill_value): assert_identical(expected, actual) -class TestAutoCombine: - def test_auto_combine(self): +class TestCombineAuto: + def test_combine_auto(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] actual = combine_auto(objs) expected = Dataset({'x': [0, 1]}) @@ -571,7 +571,7 @@ def test_infer_order_from_coords(self): expected = data assert expected.broadcast_equals(actual) - def test_auto_combine_previously_failed(self): + def test_combine_auto_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. 
datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), @@ -582,7 +582,7 @@ def test_auto_combine_previously_failed(self): actual = combine_auto(datasets) assert_identical(expected, actual) - def test_auto_combine_still_fails(self): + def test_combine_auto_still_fails(self): # concat can't handle new variables (yet): # https://github.com/pydata/xarray/issues/508 datasets = [Dataset({'x': 0}, {'y': 0}), @@ -590,7 +590,7 @@ def test_auto_combine_still_fails(self): with pytest.raises(ValueError): combine_auto(datasets, 'y') - def test_auto_combine_no_concat(self): + def test_combine_auto_no_concat(self): objs = [Dataset({'x': 0}), Dataset({'y': 1})] actual = combine_auto(objs) expected = Dataset({'x': 0, 'y': 1}) From f4fc03daec6e38aa3003ca9d10905496017e4fb4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 28 May 2019 10:36:48 +0100 Subject: [PATCH 90/96] Included minor suggestions from shoyer --- doc/combining.rst | 53 +++++++++++----------- xarray/backends/api.py | 11 +++-- xarray/core/combine.py | 88 +++++++++++++++++++----------------- xarray/core/concat.py | 2 - xarray/tests/test_combine.py | 9 ++++ 5 files changed, 88 insertions(+), 75 deletions(-) diff --git a/doc/combining.rst b/doc/combining.rst index b7a55efcf0d..852157e748f 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -247,28 +247,29 @@ Combining along multiple dimensions .. note:: There are currently three combining functions with similar names: - ``auto_combine``, ``combine_auto``, and ``combine_manual``. This is because + :py:func:`~xarray.auto_combine`, :py:func:`~xarray.combine_auto`, and + :py:func:`~xarray.combine_manual`. This is because ``auto_combine`` is in the process of being deprecated in favour of the other two functions, which are more general. If your code currently relies on ``auto_combine``, then you will be able to get similar functionality by using ``combine_manual``. For combining many objects along multiple dimensions xarray provides -``combine_manual`` and ``combine_auto``. These functions use a combination of -``concat`` and ``merge`` across different variables to combine many objects -into one. - -``combine_manual`` requires specifying the order in which the objects should be -combined, while ``combine_auto`` attempts to infer this ordering automatically -from the coordinates in the data. - -``combine_manual`` is useful when you know the spatial relationship between -each object in advance. The datasets must be provided in form of a nested list, -which specifies their relative position and ordering. A common task is -collecting data from a parallelized simulation where each processor wrote out -data to a separate file. A domain which was decomposed into 4 parts, 2 each -along both the x and y axes, requires organising the datasets into a -doubly-nested list, e.g: +:py:func:`~xarray.combine_manual`` and :py:func:`~xarray.combine_auto`. These +functions use a combination of ``concat`` and ``merge`` across different +variables to combine many objects into one. + +:py:func:`~xarray.combine_manual`` requires specifying the order in which the +objects should be combined, while :py:func:`~xarray.combine_auto` attempts to +infer this ordering automatically from the coordinates in the data. + +:py:func:`~xarray.combine_manual` is useful when you know the spatial +relationship between each object in advance. The datasets must be provided in +the form of a nested list, which specifies their relative position and +ordering. 
A common task is collecting data from a parallelized simulation where +each processor wrote out data to a separate file. A domain which was decomposed +into 4 parts, 2 each along both the x and y axes, requires organising the +datasets into a doubly-nested list, e.g: .. ipython:: python @@ -277,8 +278,8 @@ doubly-nested list, e.g: ds_grid = [[arr, arr], [arr, arr]] xr.combine_manual(ds_grid, concat_dim=['x', 'y']) -``combine_manual`` can also be used to explicitly merge datasets with -different variables. For example if we have 4 datasets, which are divided +:py:func:`~xarray.combine_manual` can also be used to explicitly merge datasets +with different variables. For example if we have 4 datasets, which are divided along two times, and contain two different variables, we can pass ``None`` to ``'concat_dim'`` to specify the dimension of the nested list over which we wish to use ``merge`` instead of ``concat``: @@ -290,9 +291,9 @@ we wish to use ``merge`` instead of ``concat``: ds_grid = [[temp, precip], [temp, precip]] xr.combine_manual(ds_grid, concat_dim=['t', None]) -``combine_auto`` is for combining objects which have dimension coordinates -which specify their relationship to and order relative to one another, for -example a linearly-increasing 'time' dimension coordinate. +:py:func:`~xarray.combine_auto` is for combining objects which have dimension +coordinates which specify their relationship to and order relative to one +another, for example a linearly-increasing 'time' dimension coordinate. Here we combine two datasets using their common dimension coordinates. Notice they are concatenated in order based on the values in their dimension @@ -305,8 +306,8 @@ coordinates, not on their position in the list passed to ``combine_auto``. x2 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [3, 4, 5])]) xr.combine_auto([x2, x1]) -These functions can be used by ``open_mfdataset`` to open many files as one -dataset. The particular function used is specified by setting the argument -``'combine'`` to ``'auto'`` or ``'manual'``. This is useful for situations -where your data is split across many files in multiple locations, which have -some known relationship between one another. \ No newline at end of file +These functions can be used by :py:func:`~xarray.open_mfdataset` to open many +files as one dataset. The particular function used is specified by setting the +argument ``'combine'`` to ``'auto'`` or ``'manual'``. This is useful for +situations where your data is split across many files in multiple locations, +which have some known relationship between one another. \ No newline at end of file diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 31e192125cb..51c5495b6c3 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -633,8 +633,9 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', disable concatenation along a particular dimension. combine : {'auto', 'manual'}, optional Whether ``xarray.auto_combine`` or ``xarray.manual_combine`` is used to - combine all the data. Default is to use ``xarray.auto_combine``, but - this function has been deprecated.. + combine all the data. If this argument is not provided, + `xarray.auto_combine` is used, but in the future this behavior will + switch to use `xarray.combine_auto`. 
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -732,7 +733,7 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', # If combine='auto' then this is unnecessary, but quick. # If combine='manual' then this creates a flat list which is easier to # iterate over, while saving the originally-supplied structure as "ids" - if combine is 'manual': + if combine == 'manual': if concat_dim is '__auto_combine__': raise ValueError("Must supply concat_dim when using manual " "combine") @@ -787,8 +788,8 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', combined = combine_auto(datasets, compat=compat, data_vars=data_vars, coords=coords) else: - raise ValueError("{} is an invalid option forthe keyword argument " - "``combine``".format(combine)) + raise ValueError("{} is an invalid option for the keyword argument" + " ``combine``".format(combine)) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3528a3a686d..58fa8be11aa 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,10 +1,12 @@ import itertools import warnings from collections import Counter, OrderedDict +from textwrap import dedent import pandas as pd from .dataarray import DataArray +from .dataset import Dataset from . import dtypes from .merge import merge from .concat import concat @@ -83,11 +85,10 @@ def _infer_concat_order_from_coords(datasets): # Assume that any two datasets whose coord along dim starts # with the same value have the same coord values throughout. - try: - first_items = pd.Index([index.take([0]) - for index in indexes]) - except IndexError: + if any(index.size == 0 for index in indexes): raise ValueError('Cannot handle size zero dimensions') + first_items = pd.Index([index.take([0]) + for index in indexes]) # Sort datasets along dim # We want rank but with identical elements given identical @@ -117,6 +118,9 @@ def _check_shape_tile_ids(combined_tile_ids): # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(tile_id) for tile_id in tile_ids] + print(nesting_depths) + if not nesting_depths: + nesting_depths = [0] if not set(nesting_depths) == {nesting_depths[0]}: raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") @@ -157,13 +161,13 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', combined_ds : xarray.Dataset """ - tile_id, ds = list(combined_ids.items())[0] + example_tile_id = next(iter(combined_ids.keys())) - n_dims = len(tile_id) + n_dims = len(example_tile_id) if len(concat_dims) != n_dims: raise ValueError("concat_dims has length {} but the datasets " "passed are nested in a {}-dimensional structure" - .format(str(len(concat_dims)), str(n_dims))) + .format(len(concat_dims), n_dims)) # Each iteration of this loop reduces the length of the tile_ids tuples # by one. 
It always combines along the first dimension, removing the first @@ -175,7 +179,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', coords=coords, compat=compat, fill_value=fill_value) - combined_ds = list(combined_ids.values())[0] + (combined_ds,) = combined_ids.values() return combined_ds @@ -233,6 +237,9 @@ def _new_tile_id(single_id_ds_pair): def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids, fill_value=dtypes.NA): + if len(datasets) == 0: + return Dataset() + # Arrange datasets for concatenation # Use information from the shape of the user input if not ids: @@ -522,8 +529,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', """ Attempt to auto-magically combine the given datasets into one. - This entire function is in the process of being deprecated in favour of - ``combine_manual`` and ``combine_auto``. + This entire function is deprecated in favour of ``combine_manual`` and + ``combine_auto``. This method attempts to combine a list of datasets into a single entity by inspecting metadata and using a combination of concat and merge. @@ -577,47 +584,44 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', Dataset.merge """ + message = """In xarray version 0.14 `auto_combine` will be deprecated.""" + if concat_dim is '_not_supplied': concat_dim = _CONCAT_DIM_DEFAULT else: - message = """In xarray version 0.14 `auto_combine` will be deprecated, - and `open_mfdataset` will no longer accept a `concat_dim` - argument. To get equivalent behaviour from now on please use - the new `combine_manual` function instead (or the - `combine='manual'` option to open_mfdataset).""" - warnings.warn(message, FutureWarning) + message += dedent("""\ + Also `open_mfdataset` will no longer accept a `concat_dim` argument. + To get equivalent behaviour from now on please use the new + `combine_manual` function instead (or the `combine='manual'` option to + `open_mfdataset`).""") if _dimension_coords_exist(datasets): - message = """In xarray version 0.14 `auto_combine` will be deprecated. - The datasets supplied have global dimension coordinates. - You may want to use the new `combine_auto` function (or the - `combine='auto'` option to `open_mfdataset` to order the - datasets before concatenation. Alternatively, to continue - concatenating based on the order the datasets are supplied in - in future, please use the new `combine_manual` function (or - the `combine='manual'` option to open_mfdataset).""" - warnings.warn(message, FutureWarning) + message += dedent("""\ + The datasets supplied have global dimension coordinates. You may want + to use the new `combine_auto` function (or the `combine='auto'` option + to `open_mfdataset` to order the datasets before concatenation. + Alternatively, to continue concatenating based on the order the + datasets are supplied in in future, please use the new `combine_manual` + function (or the `combine='manual'` option to open_mfdataset).""") else: - message = """In xarray version 0.14 `auto_combine` will be deprecated. - The datasets supplied do not have global dimension - coordinates. In future, to continue concatenating without - supplying dimension coordinates, please use the new - `combine_manual` function (or the `combine='manual'` option - to open_mfdataset).""" - warnings.warn(message, FutureWarning) + message += dedent("""\ + The datasets supplied do not have global dimension coordinates. 
In + future, to continue concatenating without supplying dimension + coordinates, please use the new `combine_manual` function (or the + `combine='manual'` option to open_mfdataset.""") if _requires_concat_and_merge(datasets): manual_dims = [concat_dim].append(None) - message = """In xarray version 0.14 `auto_combine` will be deprecated. - The datasets supplied require both concatenation and merging. - From xarray version 0.14 this will operation will require - either using the new `manual_combine` function (or the - `combine='manual'` option to open_mfdataset), with - a nested list structure such that you can combine along the - dimensions {}. Alternatively if your datasets have global - dimension coordinates then you can use the new `combine_auto` - function.""".format(manual_dims) - warnings.warn(message, FutureWarning) + message += dedent("""\ + The datasets supplied require both concatenation and merging. From + xarray version 0.14 this will operation will require either using the + new `combine_manual` function (or the `combine='manual'` option to + open_mfdataset), with a nested list structure such that you can combine + along the dimensions {}. Alternatively if your datasets have global + dimension coordinates then you can use the new `combine_auto` function. + """.format(manual_dims)) + + warnings.warn(message, FutureWarning, stacklevel=2) return _old_auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars, diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 3959956997f..5698596dde7 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import, division, print_function - import warnings from collections import OrderedDict diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 60b98d6e676..a54efa8bddd 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -317,6 +317,9 @@ def test_manual_concat(self): actual = combine_manual([actual], concat_dim=None) assert_identical(expected, actual) + actual = combine_manual([actual], concat_dim='x') + assert_identical(expected, actual) + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] actual = combine_manual(objs, concat_dim='x') expected = Dataset({'x': [0, 1, 2]}) @@ -333,6 +336,9 @@ def test_manual_concat(self): with pytest.raises(KeyError): combine_manual(objs, concat_dim='x') + def test_empty_input(self): + assert_identical(Dataset(), combine_manual([], concat_dim='x')) + # Fails because of concat's weird treatment of dimension coords, see #2975 @pytest.mark.xfail def test_manual_concat_too_many_dims_at_once(self): @@ -564,6 +570,9 @@ def test_combine_auto(self): with raises_regex(ValueError, 'Every dimension needs a coordinate'): combine_auto(objs) + def test_empty_input(self): + assert_identical(Dataset(), combine_auto([])) + def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] From 917ebeed5b956e9c6524cd64f065237cb597ed3f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 28 May 2019 17:36:33 +0100 Subject: [PATCH 91/96] Removed trailing whitespace --- xarray/core/combine.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 58fa8be11aa..48662960853 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -590,34 +590,34 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', 
concat_dim = _CONCAT_DIM_DEFAULT else: message += dedent("""\ - Also `open_mfdataset` will no longer accept a `concat_dim` argument. - To get equivalent behaviour from now on please use the new - `combine_manual` function instead (or the `combine='manual'` option to + Also `open_mfdataset` will no longer accept a `concat_dim` argument. + To get equivalent behaviour from now on please use the new + `combine_manual` function instead (or the `combine='manual'` option to `open_mfdataset`).""") if _dimension_coords_exist(datasets): message += dedent("""\ - The datasets supplied have global dimension coordinates. You may want - to use the new `combine_auto` function (or the `combine='auto'` option - to `open_mfdataset` to order the datasets before concatenation. - Alternatively, to continue concatenating based on the order the + The datasets supplied have global dimension coordinates. You may want + to use the new `combine_auto` function (or the `combine='auto'` option + to `open_mfdataset` to order the datasets before concatenation. + Alternatively, to continue concatenating based on the order the datasets are supplied in in future, please use the new `combine_manual` function (or the `combine='manual'` option to open_mfdataset).""") else: message += dedent("""\ - The datasets supplied do not have global dimension coordinates. In - future, to continue concatenating without supplying dimension - coordinates, please use the new `combine_manual` function (or the + The datasets supplied do not have global dimension coordinates. In + future, to continue concatenating without supplying dimension + coordinates, please use the new `combine_manual` function (or the `combine='manual'` option to open_mfdataset.""") if _requires_concat_and_merge(datasets): manual_dims = [concat_dim].append(None) message += dedent("""\ - The datasets supplied require both concatenation and merging. From - xarray version 0.14 this will operation will require either using the - new `combine_manual` function (or the `combine='manual'` option to + The datasets supplied require both concatenation and merging. From + xarray version 0.14 this will operation will require either using the + new `combine_manual` function (or the `combine='manual'` option to open_mfdataset), with a nested list structure such that you can combine - along the dimensions {}. Alternatively if your datasets have global + along the dimensions {}. Alternatively if your datasets have global dimension coordinates then you can use the new `combine_auto` function. 
""".format(manual_dims)) From 1e537baa3d53afc7fec9679f44f46f7298babc6b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 29 May 2019 09:39:12 +0100 Subject: [PATCH 92/96] Simplified error message for case combine_manual can't handle --- xarray/core/combine.py | 11 +++++------ xarray/tests/test_concat.py | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 48662960853..a7f649b8e5b 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -215,12 +215,11 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all', coords=coords, fill_value=fill_value) except ValueError as err: if "encountered unexpected variable" in str(err): - raise ValueError("These objects cannot be combined along the " - "dimension {concat_dim} using only " - "xarray.concat, you must use " - "xarray.combine_auto instead, as this can " - "handle combining operations requiring both " - "concat and merge along the same dimension.") + raise ValueError("These objects cannot be combined using only " + "xarray.combine_manual, instead either use " + "xarray.combine_auto, or do it manually " + "with xarray.concat, xarray.merge and " + "xarray.align") else: raise else: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ba8832111b5..8a69e7d8ded 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -320,4 +320,3 @@ def test_concat_fill_value(self, fill_value): dims=['y', 'x'], coords={'x': [1, 2, 3]}) actual = concat((foo, bar), dim='y', fill_value=fill_value) assert_identical(actual, expected) - From 7d6845b6a743ded311c7e34a7b8d7a01603f59ff Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 29 May 2019 10:39:55 +0100 Subject: [PATCH 93/96] Removed filter for deprecation warnings, and added test for if user doesn't supply concat_dim --- xarray/backends/api.py | 6 +-- xarray/tests/test_backends.py | 80 +++++++++++++++++++++++------------ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 51c5495b6c3..49dd79a73d2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -734,9 +734,9 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', # If combine='manual' then this creates a flat list which is easier to # iterate over, while saving the originally-supplied structure as "ids" if combine == 'manual': - if concat_dim is '__auto_combine__': - raise ValueError("Must supply concat_dim when using manual " - "combine") + if str(concat_dim) == '_not_supplied': + raise ValueError("Must supply concat_dim when using " + "combine='manual'") else: if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9aa7abc9880..54bb6abe6a3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2163,8 +2163,6 @@ def skip_if_not_engine(engine): pytest.importorskip(engine) -@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " - "will be deprecated") def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, file_cache_maxsize): @@ -2188,7 +2186,8 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, subds.to_netcdf(tmpfiles[ii], engine=writeengine) # check that calculation on opened datasets works properly - with open_mfdataset(tmpfiles, engine=readengine, parallel=parallel, + with open_mfdataset(tmpfiles, 
combine='manual', concat_dim='x', + engine=readengine, parallel=parallel, chunks=chunks) as actual: # check that using open_mfdataset returns dask arrays for variables @@ -2198,8 +2197,6 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, @requires_scipy_or_netCDF4 -@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " - "will be deprecated") class TestOpenMFDatasetWithDataVarsAndCoordsKw: coord_name = 'lon' var_name = 'v1' @@ -2246,11 +2243,13 @@ def gen_datasets_with_common_coord_and_time(self): @pytest.mark.parametrize('opt', ['all', 'minimal', 'different']) def test_open_mfdataset_does_same_as_concat(self, opt): with self.setup_files_and_datasets() as (files, [ds1, ds2]): - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: kwargs = dict(data_vars=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) assert_identical(ds, ds_expect) - with open_mfdataset(files, coords=opt) as ds: + with open_mfdataset(files, coords=opt, + combine='manual', concat_dim='t') as ds: kwargs = dict(coords=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) assert_identical(ds, ds_expect) @@ -2260,7 +2259,8 @@ def test_common_coord_when_datavars_all(self): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files with the data_var option - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -2277,7 +2277,8 @@ def test_common_coord_when_datavars_minimal(self): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files using data_vars option - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -2293,20 +2294,20 @@ def test_invalid_data_vars_value_should_fail(self): with self.setup_files_and_datasets() as (files, _): with pytest.raises(ValueError): - with open_mfdataset(files, data_vars='minimum'): + with open_mfdataset(files, data_vars='minimum', + combine='auto'): pass # test invalid coord parameter with pytest.raises(ValueError): - with open_mfdataset(files, coords='minimum'): + with open_mfdataset(files, coords='minimum', + combine='auto'): pass @requires_dask @requires_scipy @requires_netCDF4 -@pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " - "will be deprecated") class TestDask(DatasetIOBase): @contextlib.contextmanager def create_store(self): @@ -2368,11 +2369,13 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) - with open_mfdataset([tmp1, tmp2], + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual', chunks={'x': 3}) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) @@ -2422,7 +2425,8 @@ def test_open_mfdataset_pathlib(self): tmp2 = Path(tmp2) original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) 
as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(original, actual) @requires_pathlib @@ -2457,7 +2461,8 @@ def test_open_mfdataset_2(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): @@ -2470,7 +2475,8 @@ def test_attrs_mfdataset(self): ds2.attrs['test2'] = 'bar' ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: # presumes that attributes inherited from # first dataset loaded assert actual.test1 == ds1.test1 @@ -2490,6 +2496,17 @@ def test_open_mfdataset_auto_combine(self): with open_mfdataset([tmp2, tmp1], combine='auto') as actual: assert_identical(original, actual) + def test_open_mfdataset_combine_manual_no_concat_dim(self): + original = Dataset({'foo': ('x', np.random.randn(10)), + 'x': np.arange(10)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + + with raises_regex(ValueError, 'Must supply concat_dim'): + open_mfdataset([tmp2, tmp1], combine='manual') + def test_preprocess_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: @@ -2499,7 +2516,8 @@ def preprocess(ds): return ds.assign_coords(z=0) expected = preprocess(original) - with open_mfdataset(tmp, preprocess=preprocess) as actual: + with open_mfdataset(tmp, preprocess=preprocess, + combine='auto') as actual: assert_identical(expected, actual) def test_save_mfdataset_roundtrip(self): @@ -2509,7 +2527,8 @@ def test_save_mfdataset_roundtrip(self): with create_tmp_file() as tmp1: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(actual, original) def test_save_mfdataset_invalid(self): @@ -2535,14 +2554,15 @@ def test_save_mfdataset_pathlib_roundtrip(self): tmp1 = Path(tmp1) tmp2 = Path(tmp2) save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(actual, original) def test_open_and_do_math(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: actual = 1.0 * ds assert_allclose(original, actual, decode_bytes=False) @@ -2552,7 +2572,8 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=None, + combine='manual') as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2579,7 +2600,8 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dim=dim) as actual: + with open_mfdataset([tmp], concat_dim=dim, + combine='manual') as actual: assert_identical(expected, actual) def test_open_multi_dataset(self): @@ -2602,7 +2624,8 @@ 
def test_open_multi_dataset(self): create_tmp_file() as tmp2: original.to_netcdf(tmp1) original.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=dim) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=dim, + combine='manual') as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): @@ -2621,10 +2644,10 @@ def test_deterministic_names(self): with create_tmp_file() as tmp: data = create_test_data() data.to_netcdf(tmp) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: original_names = dict((k, v.data.name) for k, v in ds.data_vars.items()) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: repeat_names = dict((k, v.data.name) for k, v in ds.data_vars.items()) for var_name, dask_name in original_names.items(): @@ -2654,7 +2677,8 @@ def test_save_mfdataset_compute_false_roundtrip(self): engine=self.engine, compute=False) assert isinstance(delayed_obj, Delayed) delayed_obj.compute() - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(actual, original) def test_load_dataset(self): From 508347114cb00fe0abcf1ca7c88b750aa9e81276 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 21 Jun 2019 14:06:36 +0100 Subject: [PATCH 94/96] Simple fixes suggested by shoyer --- xarray/core/combine.py | 7 ++----- xarray/tests/test_concat.py | 2 -- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a7f649b8e5b..23b403eaf7f 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -585,7 +585,7 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', message = """In xarray version 0.14 `auto_combine` will be deprecated.""" - if concat_dim is '_not_supplied': + if concat_dim == '_not_supplied': concat_dim = _CONCAT_DIM_DEFAULT else: message += dedent("""\ @@ -668,10 +668,7 @@ def _requires_concat_and_merge(datasets): sorted_datasets = sorted(datasets, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) - if len(list(grouped_by_vars)) > 1: - return True - else: - return False + return len(list(grouped_by_vars)) > 1 def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 8a69e7d8ded..31d5b9c6f72 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import, division, print_function - from copy import deepcopy import numpy as np From 4cc70ae830b70193d9442051ee23b50d27c0ce7b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 21 Jun 2019 16:12:46 +0100 Subject: [PATCH 95/96] Change deprecation warning behaviour --- xarray/core/combine.py | 27 ++++++++++----------------- xarray/tests/test_combine.py | 2 ++ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 23b403eaf7f..9db07a6f8ad 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -583,12 +583,14 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', Dataset.merge """ - message = """In xarray version 0.14 `auto_combine` will be deprecated.""" + basic_msg = """In xarray version 0.14 `auto_combine` will be deprecated.""" + warnings.warn(basic_msg, FutureWarning, stacklevel=2) if concat_dim == '_not_supplied': concat_dim = _CONCAT_DIM_DEFAULT + message = '' else: - 
message += dedent("""\ + message = dedent("""\ Also `open_mfdataset` will no longer accept a `concat_dim` argument. To get equivalent behaviour from now on please use the new `combine_manual` function instead (or the `combine='manual'` option to @@ -637,25 +639,16 @@ def _dimension_coords_exist(datasets): sorted_datasets = sorted(datasets, key=vars_as_keys) grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) - # Perform the multidimensional combine on each group of data variables - # before merging back together - concatenated_grouped_by_data_vars = [] + # Simulates performing the multidimensional combine on each group of data + # variables before merging back together try: for vars, datasets_with_same_vars in grouped_by_vars: _infer_concat_order_from_coords(list(datasets_with_same_vars)) return True - except ValueError as err: - no_dimension_coords_errs = ["Every dimension needs a coordinate", - "neither monotonically increasing nor", - "Cannot handle size zero", - "Could not find any dimension coordinates"] - if any(message in str(err) for message in no_dimension_coords_errs): - # The ValueError just means that the datasets don't have - # global dimension coordinates - return False - else: - # There is a different problem - raise err + except ValueError: + # ValueError means datasets don't have global dimension coordinates + # Or something else went wrong in trying to determine them + return False def _requires_concat_and_merge(datasets): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index a54efa8bddd..b1a03d5cee5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -620,6 +620,8 @@ def test_check_for_impossible_ordering(self): @pytest.mark.filterwarnings("ignore:In xarray version 0.14 `auto_combine` " "will be deprecated") +@pytest.mark.filterwarnings("ignore:Also `open_mfdataset` will no longer") +@pytest.mark.filterwarnings("ignore:The datasets supplied") class TestAutoCombineOldAPI: """ Set of tests which check that old 1-dimensional auto_combine behaviour is From 357531fdb796064dd2404016bcd0ea6e68001339 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 25 Jun 2019 14:42:52 +0100 Subject: [PATCH 96/96] linting --- asv_bench/benchmarks/combine.py | 2 +- xarray/backends/api.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 6ce55b17314..8670760abc1 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -9,7 +9,7 @@ def setup(self): """Create 4 datasets with two different variables""" t_size, x_size, y_size = 100, 900, 800 - t, x, y = np.arange(t_size), np.arange(x_size), np.arange(y_size) + t = np.arange(t_size) data = np.random.randn(t_size, x_size, y_size) self.dsA0 = xr.Dataset( diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 49dd79a73d2..f3bab5d084d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -770,19 +770,19 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', # Combine all datasets, closing them in case of a ValueError try: - if combine is '_old_auto': + if combine == '_old_auto': # Use the old auto_combine for now # Remove this after deprecation cycle from #2616 is complete combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars, coords=coords) - elif combine is 'manual': + elif combine == 'manual': # Combined nested list by successive concat and merge operations # along each 
dimension, using structure given by "ids" combined = _manual_combine(datasets, concat_dims=concat_dim, compat=compat, data_vars=data_vars, coords=coords, ids=ids) - elif combine is 'auto': + elif combine == 'auto': # Redo ordering from coordinates, ignoring how they were ordered # previously combined = combine_auto(datasets, compat=compat,
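
For reference, the dispatch above corresponds to the user-facing call patterns that the updated tests earlier in this series exercise. A minimal usage sketch against the API as defined by these patches (the file names are hypothetical; the tests use temporary NetCDF files split along 'x'):

    import xarray as xr

    paths = ['part0.nc', 'part1.nc']  # hypothetical files, each holding half of 'x'

    # combine='manual' concatenates the files in the order given, so the
    # dimension to join along must be supplied explicitly; omitting
    # concat_dim raises ValueError ("Must supply concat_dim", as the new
    # test_open_mfdataset_combine_manual_no_concat_dim checks).
    ds_manual = xr.open_mfdataset(paths, combine='manual', concat_dim='x')

    # combine='auto' instead orders the datasets from their dimension
    # coordinates, so no concat_dim is needed.
    ds_auto = xr.open_mfdataset(paths, combine='auto')

    # Leaving combine at its default ('_old_auto') falls back to the
    # deprecated auto_combine code path and emits the FutureWarning added
    # earlier in this series.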