From 66ab0ae4f3aa3c461357a5a895405e81357796b1 Mon Sep 17 00:00:00 2001 From: Romain Martinez Date: Sat, 12 Sep 2020 12:50:17 -0400 Subject: [PATCH 01/22] add pyomeca to xarray related project (#4416) --- doc/related-projects.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/related-projects.rst b/doc/related-projects.rst index 256af3c1c16..a1d9d8cb412 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -58,6 +58,7 @@ Other domains ~~~~~~~~~~~~~ - `ptsa `_: EEG Time Series Analysis - `pycalphad `_: Computational Thermodynamics in Python +- `pyomeca `_: Python framework for biomechanical analysis Extend xarray capabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 59f57f3e410aca19a722c4a0d84359bde9852fbf Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 15 Sep 2020 18:33:29 -0700 Subject: [PATCH 02/22] Fix indexing with datetime64[ns] with pandas=1.1 (#4292) * Fix indexing with datetime64[ns] with pandas=1.1 Fixes #4283 The underlying issue is that calling `.item()` on a NumPy array with `dtype=datetime64[ns]` returns an _integer_, rather than an `np.datetime64 scalar. This is somewhat baffling but works this way because `.item()` returns native Python types, but `datetime.datetime` doesn't support nanosecond precision. `pandas.Index.get_loc` used to support these integers, but now is more strict. Hence we get errors. We can fix this by using `array[()]` to convert 0d arrays into NumPy scalars instead of calling `array.item()`. I've added a crude regression test. There may well be a better way to test this but I haven't figured it out yet. * lint fix * add a test checking the datetime indexer * use label.item() for non-datetime / timedelta labels * unpin pandas in the docs * ignore the future warning about deprecated arguments to pandas.Grouper * Update xarray/core/indexing.py Co-authored-by: keewis * Add whatsnew note Co-authored-by: Keewis Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Co-authored-by: keewis --- ci/requirements/doc.yml | 4 +--- doc/whats-new.rst | 3 +++ xarray/core/common.py | 22 +++++++++++++++------- xarray/core/indexing.py | 10 +++++----- xarray/tests/test_dataarray.py | 8 +++++++- xarray/tests/test_indexing.py | 9 +++++++++ 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index d1a9c329d9f..5206b81518a 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -18,9 +18,7 @@ dependencies: - netcdf4>=1.5 - numba - numpy>=1.17 - # FIXME https://github.com/pydata/xarray/issues/4287 - # - pandas>=1.0 - - pandas=1.0 + - pandas>=1.0 - rasterio>=1.1 - seaborn - setuptools diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 74619529144..d8b1fc2fba9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -86,6 +86,9 @@ Bug fixes By `Jens Svensmark `_ - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. +- Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). + By `Stephan Hoyer `_ and + `Justus Magin `_. 
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/common.py b/xarray/core/common.py index b48a2f56a0d..b7ae9121700 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1126,14 +1126,22 @@ def resample( dim_name = dim dim_coord = self[dim] - if isinstance(self.indexes[dim_name], CFTimeIndex): - from .resample_cftime import CFTimeGrouper - - grouper = CFTimeGrouper(freq, closed, label, base, loffset) - else: - grouper = pd.Grouper( - freq=freq, closed=closed, label=label, base=base, loffset=loffset + # TODO: remove once pandas=1.1 is the minimum required version + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r"'(base|loffset)' in .resample\(\) and in Grouper\(\) is deprecated.", + category=FutureWarning, ) + + if isinstance(self.indexes[dim_name], CFTimeIndex): + from .resample_cftime import CFTimeGrouper + + grouper = CFTimeGrouper(freq, closed, label, base, loffset) + else: + grouper = pd.Grouper( + freq=freq, closed=closed, label=label, base=base, loffset=loffset + ) group = DataArray( dim_coord, coords=dim_coord.coords, dims=dim_coord.dims, name=RESAMPLE_DIM ) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 66c62653139..da0bf66944f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -178,8 +178,10 @@ def convert_label_indexer(index, label, index_name="", method=None, tolerance=No else _asarray_tuplesafe(label) ) if label.ndim == 0: + # see https://github.com/pydata/xarray/pull/4292 for details + label_value = label[()] if label.dtype.kind in "mM" else label.item() if isinstance(index, pd.MultiIndex): - indexer, new_index = index.get_loc_level(label.item(), level=0) + indexer, new_index = index.get_loc_level(label_value, level=0) elif isinstance(index, pd.CategoricalIndex): if method is not None: raise ValueError( @@ -189,11 +191,9 @@ def convert_label_indexer(index, label, index_name="", method=None, tolerance=No raise ValueError( "'tolerance' is not a valid kwarg when indexing using a CategoricalIndex." 
) - indexer = index.get_loc(label.item()) + indexer = index.get_loc(label_value) else: - indexer = index.get_loc( - label.item(), method=method, tolerance=tolerance - ) + indexer = index.get_loc(label_value, method=method, tolerance=tolerance) elif label.dtype.kind == "b": indexer = label else: diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a22ed58c9bc..5e0fe13ea52 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -938,7 +938,7 @@ def test_sel_invalid_slice(self): with raises_regex(ValueError, "cannot use non-scalar arrays"): array.sel(x=slice(array.x)) - def test_sel_dataarray_datetime(self): + def test_sel_dataarray_datetime_slice(self): # regression test for GH1240 times = pd.date_range("2000-01-01", freq="D", periods=365) array = DataArray(np.arange(365), [("time", times)]) @@ -1078,6 +1078,12 @@ def test_loc(self): assert_identical(da[:3, :4], da.loc[["a", "b", "c"], np.arange(4)]) assert_identical(da[:, :4], da.loc[:, self.ds["y"] < 4]) + def test_loc_datetime64_value(self): + # regression test for https://github.com/pydata/xarray/issues/4283 + t = np.array(["2017-09-05T12", "2017-09-05T15"], dtype="datetime64[ns]") + array = DataArray(np.ones(t.shape), dims=("time",), coords=(t,)) + assert_identical(array.loc[{"time": t[0]}], array[0]) + def test_loc_assign(self): self.ds["x"] = ("x", np.array(list("abcdefghij"))) da = self.ds["foo"] diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index d7ed16b9573..4ef7536e1f2 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -86,6 +86,15 @@ def test_convert_label_indexer(self): with pytest.raises(IndexError): indexing.convert_label_indexer(mindex, (slice(None), 1, "no_level")) + def test_convert_label_indexer_datetime(self): + index = pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]) + actual = indexing.convert_label_indexer(index, "2001-01-01") + expected = (1, None) + assert actual == expected + + actual = indexing.convert_label_indexer(index, index.to_numpy()[1]) + assert actual == expected + def test_convert_unsorted_datetime_index_raises(self): index = pd.to_datetime(["2001", "2000", "2002"]) with pytest.raises(KeyError): From b0d8d93665dbb6d28e33dfd28ad27036c20c60bf Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 17 Sep 2020 14:59:08 +0200 Subject: [PATCH 03/22] fix doc dataarray to netcdf (#4424) * fix doc dataarray to netcdf * codeblock --- xarray/core/dataarray.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 86cb7ad988e..94b7f702920 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2490,15 +2490,19 @@ def to_masked_array(self, copy: bool = True) -> np.ma.MaskedArray: def to_netcdf(self, *args, **kwargs) -> Union[bytes, "Delayed", None]: """Write DataArray contents to a netCDF file. - All parameters are passed directly to `xarray.Dataset.to_netcdf`. + All parameters are passed directly to :py:meth:`xarray.Dataset.to_netcdf`. Notes ----- Only xarray.Dataset objects can be written to netCDF files, so the xarray.DataArray is converted to a xarray.Dataset object containing a single variable. If the DataArray has no name, or if the - name is the same as a co-ordinate name, then it is given the name - '__xarray_dataarray_variable__'. + name is the same as a coordinate name, then it is given the name + ``"__xarray_dataarray_variable__"``. 
+ + See Also -------- Dataset.to_netcdf """ from ..backends.api import DATAARRAY_NAME, DATAARRAY_VARIABLE From 9a8a62ba551e737dc87e39aded2f7cc788ff118d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 18:19:22 -0500 Subject: [PATCH 04/22] Fix optimize for chunked DataArray (#4432) Previously we generated an invalid Dask task graph, because the lines removed here dropped keys that were referenced elsewhere in the task graph. The original implementation had a comment indicating that this was to cull: https://github.com/pydata/xarray/blame/502a988ad5b87b9f3aeec3033bf55c71272e1053/xarray/core/variable.py#L384 Just spot-checking things, I think we're OK here though. Something like `dask.visualize(arr[[0]], optimize_graph=True)` indicates that we're OK. Closes https://github.com/pydata/xarray/issues/3698 Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 2 ++ xarray/core/variable.py | 3 --- xarray/tests/test_dask.py | 7 +++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d8b1fc2fba9..60a53512622 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -84,11 +84,13 @@ Bug fixes - Fix `KeyError` when doing linear interpolation to an nd `DataArray` that contains NaNs (:pull:`4233`). By `Jens Svensmark `_ +- Fix ``dask.optimize`` on ``DataArray`` producing an invalid Dask task graph (:issue:`3698`) - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. - Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). By `Stephan Hoyer `_ and `Justus Magin `_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6de00ee882a..c55e61cb816 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -501,9 +501,6 @@ def __dask_postpersist__(self): @staticmethod def _dask_finalize(results, array_func, array_args, dims, attrs, encoding): - if isinstance(results, dict): # persist case - name = array_args[0] - results = {k: v for k, v in results.items() if k[0] == name} data = array_func(results, *array_args) return Variable(dims, data, attrs=attrs, encoding=encoding) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 46685a29a47..489bf09fa3c 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1607,3 +1607,10 @@ def test_more_transforms_pass_lazy_array_equiv(map_da, map_ds): assert_equal(map_da._from_temp_dataset(map_da._to_temp_dataset()), map_da) assert_equal(map_da.astype(map_da.dtype), map_da) assert_equal(map_da.transpose("y", "x", transpose_coords=False).cxy, map_da.cxy) + + +def test_optimize(): + a = dask.array.ones((10, 5), chunks=(1, 3)) + arr = xr.DataArray(a).chunk(5) + (arr2,) = dask.optimize(arr) + arr2.compute() From 902f1fcd9d455d70af6dd4cd39b403fe05dda993 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Thu, 17 Sep 2020 20:06:45 -0700 Subject: [PATCH 05/22] Revert "Fix optimize for chunked DataArray (#4432)" (#4434) This reverts commit 9a8a62ba551e737dc87e39aded2f7cc788ff118d.
--- doc/whats-new.rst | 2 -- xarray/core/variable.py | 3 +++ xarray/tests/test_dask.py | 7 ------- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 60a53512622..d8b1fc2fba9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -84,13 +84,11 @@ Bug fixes - Fix `KeyError` when doing linear interpolation to an nd `DataArray` that contains NaNs (:pull:`4233`). By `Jens Svensmark `_ -- Fix ``dask.optimize`` on ``DataArray`` producing an invalid Dask task graph (:issue:`3698`) - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. - Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). By `Stephan Hoyer `_ and `Justus Magin `_. - Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c55e61cb816..6de00ee882a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -501,6 +501,9 @@ def __dask_postpersist__(self): @staticmethod def _dask_finalize(results, array_func, array_args, dims, attrs, encoding): + if isinstance(results, dict): # persist case + name = array_args[0] + results = {k: v for k, v in results.items() if k[0] == name} data = array_func(results, *array_args) return Variable(dims, data, attrs=attrs, encoding=encoding) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 489bf09fa3c..46685a29a47 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1607,10 +1607,3 @@ def test_more_transforms_pass_lazy_array_equiv(map_da, map_ds): assert_equal(map_da._from_temp_dataset(map_da._to_temp_dataset()), map_da) assert_equal(map_da.astype(map_da.dtype), map_da) assert_equal(map_da.transpose("y", "x", transpose_coords=False).cxy, map_da.cxy) - - -def test_optimize(): - a = dask.array.ones((10, 5), chunks=(1, 3)) - arr = xr.DataArray(a).chunk(5) - (arr2,) = dask.optimize(arr) - arr2.compute() From fd3eb216dc5758c52cb5fe7dc494c625c15c4fc8 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Fri, 18 Sep 2020 08:23:32 -0700 Subject: [PATCH 06/22] Clearer Vectorized Indexing example (#4433) * Clearer Vectorized Indexing example * Feedback from @alexamici --- doc/indexing.rst | 2 +- doc/whats-new.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index af8e44fb80b..58064582354 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -339,7 +339,7 @@ MATLAB, or after using the :py:func:`numpy.ix_` helper: coords={"x": [0, 1, 2], "y": ["a", "b", "c", "d"]}, ) da - da[[0, 1], [1, 1]] + da[[0, 2, 2], [1, 3]] For more flexibility, you can supply :py:meth:`~xarray.DataArray` objects as indexers. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d8b1fc2fba9..f624e89019d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -99,6 +99,8 @@ Documentation By `Sander van Rijn `_ - Update the contributing guide to use merges instead of rebasing and state that we squash-merge. (:pull:`4355`) By `Justus Magin `_. +- Updated Vectorized Indexing to a clearer example. 
+ By `Maximilian Roos `_ Internal Changes ~~~~~~~~~~~~~~~~ From b2c1550cffcac99ba48bb3d99751892eb150a6a6 Mon Sep 17 00:00:00 2001 From: keewis Date: Fri, 18 Sep 2020 23:00:09 +0200 Subject: [PATCH 07/22] Keep the original ordering of the coordinates (#4409) * un-xfail the pint assert_allclose and assert_duckarray_equal tests * update the required version of pint * keep the order of the coordinates * fix the groupby doctest * keep the order of the dims in concat * don't compute a set difference if we're filtering anyways * sort names instead of potentially dropping items * Apply suggestions from code review * sort in DatasetCoordinates.to_dataset instead of in Dataset._copy_listed * update whats-new.rst * filter _variables instead of sorting _coord_name --- ci/requirements/py36-min-nep18.yml | 2 +- doc/whats-new.rst | 8 ++++++-- xarray/core/concat.py | 6 +++++- xarray/core/coordinates.py | 4 +++- xarray/core/dataset.py | 14 +++++++++----- xarray/tests/test_testing.py | 14 ++------------ 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/ci/requirements/py36-min-nep18.yml b/ci/requirements/py36-min-nep18.yml index 17aae6932ac..14982c1d5e7 100644 --- a/ci/requirements/py36-min-nep18.yml +++ b/ci/requirements/py36-min-nep18.yml @@ -10,7 +10,7 @@ dependencies: - distributed=2.9 - numpy=1.17 - pandas=0.25 - - pint=0.13 + - pint=0.15 - pip - pytest - pytest-cov diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f624e89019d..4b451fcbc18 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,7 +33,7 @@ New Features now accept more than 1 dimension. (:pull:`4219`) By `Keisuke Fujii `_. - ``min_count`` can be supplied to reductions such as ``.sum`` when specifying - multiple dimension to reduce over. (:pull:`4356`) + multiple dimension to reduce over. (:pull:`4356`) By `Maximilian Roos `_. - :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values. (:pull:`4351`) By `Maximilian Roos `_. @@ -77,7 +77,7 @@ Bug fixes and :py:meth:`DataArray.str.wrap` (:issue:`4334`). By `Mathias Hauser `_. - Fixed overflow issue causing incorrect results in computing means of :py:class:`cftime.datetime` arrays (:issue:`4341`). By `Spencer Clark `_. -- Fixed :py:meth:`Dataset.coarsen`, :py:meth:`DataArray.coarsen` dropping attributes on original object (:issue:`4120`, :pull:`4360`). by `Julia Kent `_. +- Fixed :py:meth:`Dataset.coarsen`, :py:meth:`DataArray.coarsen` dropping attributes on original object (:issue:`4120`, :pull:`4360`). By `Julia Kent `_. - fix the signature of the plot methods. (:pull:`4359`) By `Justus Magin `_. - Fix :py:func:`xarray.apply_ufunc` with ``vectorize=True`` and ``exclude_dims`` (:issue:`3890`). By `Mathias Hauser `_. @@ -86,6 +86,8 @@ Bug fixes By `Jens Svensmark `_ - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. +- Avoid relying on :py:class:`set` objects for the ordering of the coordinates (:pull:`4409`) + By `Justus Magin `_. - Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). By `Stephan Hoyer `_ and `Justus Magin `_. @@ -99,6 +101,8 @@ Documentation By `Sander van Rijn `_ - Update the contributing guide to use merges instead of rebasing and state that we squash-merge. (:pull:`4355`) By `Justus Magin `_. +- Make sure the examples from the docstrings actually work (:pull:`4408`). + By `Justus Magin `_. - Updated Vectorized Indexing to a clearer example. 
By `Maximilian Roos `_ diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 54bc686a322..0955a95fa8b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -349,7 +349,11 @@ def _parse_datasets( all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) - for dim in set(ds.dims) - dims: + # preserves ordering of dimensions + for dim in ds.dims: + if dim in dims: + continue + if dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index a4b8ca478eb..846e4044a2c 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -215,7 +215,9 @@ def __getitem__(self, key: Hashable) -> "DataArray": def to_dataset(self) -> "Dataset": """Convert these coordinates into a new Dataset""" - return self._data._copy_listed(self._names) + + names = [name for name in self._data._variables if name in self._names] + return self._data._copy_listed(names) def _update_coords( self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, pd.Index] diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 825d2044a12..ce72d4a5886 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1142,7 +1142,11 @@ def _copy_listed(self, names: Iterable[Hashable]) -> "Dataset": dims = {k: self.dims[k] for k in needed_dims} - for k in self._coord_names: + # preserves ordering of coordinates + for k in self._variables: + if k not in self._coord_names: + continue + if set(self.variables[k].dims) <= needed_dims: variables[k] = self._variables[k] coord_names.add(k) @@ -5729,10 +5733,10 @@ def filter_by_attrs(self, **kwargs): Dimensions: (time: 3, x: 2, y: 2) Coordinates: - reference_time datetime64[ns] 2014-09-05 + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 2014-09-06 2014-09-07 2014-09-08 - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + reference_time datetime64[ns] 2014-09-05 Dimensions without coordinates: x, y Data variables: precipitation (x, y, time) float64 5.68 9.256 0.7104 ... 7.992 4.615 7.805 @@ -5742,10 +5746,10 @@ def filter_by_attrs(self, **kwargs): Dimensions: (time: 3, x: 2, y: 2) Coordinates: - reference_time datetime64[ns] 2014-09-05 + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 2014-09-06 2014-09-07 2014-09-08 - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + reference_time datetime64[ns] 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 29.11 18.2 22.83 ... 
18.28 16.15 26.63 diff --git a/xarray/tests/test_testing.py b/xarray/tests/test_testing.py index 0f2ae8b31d4..30ea6aaaee9 100644 --- a/xarray/tests/test_testing.py +++ b/xarray/tests/test_testing.py @@ -70,12 +70,7 @@ def test_assert_allclose(obj1, obj2): pytest.param( quantity, id="pint", - marks=[ - pytest.mark.skipif(not has_pint, reason="requires pint"), - pytest.mark.xfail( - reason="inconsistencies in the return value of pint's implementation of eq" - ), - ], + marks=pytest.mark.skipif(not has_pint, reason="requires pint"), ), ), ) @@ -115,12 +110,7 @@ def test_assert_duckarray_equal_failing(duckarray, obj1, obj2): pytest.param( quantity, id="pint", - marks=[ - pytest.mark.skipif(not has_pint, reason="requires pint"), - pytest.mark.xfail( - reason="inconsistencies in the return value of pint's implementation of eq" - ), - ], + marks=pytest.mark.skipif(not has_pint, reason="requires pint"), ), ), ) From 2ed6d57fa5e14e87e83c8194e619538f6edcd90a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 18 Sep 2020 15:31:08 -0700 Subject: [PATCH 08/22] Fix for h5py deepcopy issues (#4426) * Potential fix for h5py deepcopy issues * lint * Add unit test * blacker than the blackest black Co-authored-by: dcherian --- xarray/core/indexing.py | 6 +++++ xarray/tests/test_backends.py | 43 ++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index da0bf66944f..9627f431cb6 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -664,6 +664,12 @@ def __setitem__(self, key, value): self._ensure_copied() self.array[key] = value + def __deepcopy__(self, memo): + # CopyOnWriteArray is used to wrap backend array objects, which might + # point to files on disk, so we can't rely on the default deepcopy + # implementation. 
+ return type(self)(self.array) + class MemoryCachedArray(ExplicitlyIndexedNDArrayMixin): __slots__ = ("array",) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fe93f5a9777..33ac26cfd39 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1441,7 +1441,10 @@ def test_autoclose_future_warning(self): with self.open(tmp_file, autoclose=True) as actual: assert_identical(data, actual) - def test_already_open_dataset(self): + +@requires_netCDF4 +class TestNetCDF4AlreadyOpen: + def test_base_case(self): with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: v = nc.createVariable("x", "int") @@ -1453,7 +1456,7 @@ def test_already_open_dataset(self): expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) - def test_already_open_dataset_group(self): + def test_group(self): with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: group = nc.createGroup("g") @@ -1476,6 +1479,21 @@ def test_already_open_dataset_group(self): with pytest.raises(ValueError, match="must supply a root"): backends.NetCDF4DataStore(nc.groups["g"], group="g") + def test_deepcopy(self): + # regression test for https://github.com/pydata/xarray/issues/4425 + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + nc.createDimension("x", 10) + v = nc.createVariable("y", np.int32, ("x",)) + v[:] = np.arange(10) + + h5 = nc4.Dataset(tmp_file, mode="r") + store = backends.NetCDF4DataStore(h5) + with open_dataset(store) as ds: + copied = ds.copy(deep=True) + expected = Dataset({"y": ("x", np.arange(10))}) + assert_identical(expected, copied) + @requires_netCDF4 @requires_dask @@ -2422,7 +2440,10 @@ def test_dump_encodings_h5py(self): assert actual.x.encoding["compression"] == "lzf" assert actual.x.encoding["compression_opts"] is None - def test_already_open_dataset_group(self): + +@requires_h5netcdf +class TestH5NetCDFAlreadyOpen: + def test_open_dataset_group(self): import h5netcdf with create_tmp_file() as tmp_file: @@ -2443,6 +2464,22 @@ def test_already_open_dataset_group(self): expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) + def test_deepcopy(self): + import h5netcdf + + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + nc.createDimension("x", 10) + v = nc.createVariable("y", np.int32, ("x",)) + v[:] = np.arange(10) + + h5 = h5netcdf.File(tmp_file, mode="r") + store = backends.H5NetCDFStore(h5) + with open_dataset(store) as ds: + copied = ds.copy(deep=True) + expected = Dataset({"y": ("x", np.arange(10))}) + assert_identical(expected, copied) + @requires_h5netcdf class TestH5NetCDFFileObject(TestH5NetCDFData): From 7d389dbcf59c405b4b1634f32f57eb2362779685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 19 Sep 2020 05:01:27 +0200 Subject: [PATCH 09/22] preserve original dimension, coordinate and variable order in ``concat`` (#4419) * preserve original dimension, coordinate and variable order in ``concat`` * only re-insert into result_vars if already in * add test to check if dimension and coordinate order is preserved in concat * black style * Update xarray/tests/test_concat.py Co-authored-by: keewis * Update xarray/tests/test_concat.py * add whats-new.rst entry * fix scalar variable problem in test_concat Co-authored-by: Deepak Cherian Co-authored-by: keewis --- doc/whats-new.rst | 2 ++ xarray/core/concat.py | 3 +++ xarray/tests/test_concat.py | 33 +++++++++++++++++++++++++++++++++ 3 files 
changed, 38 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b451fcbc18..78e49f711e1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -86,6 +86,8 @@ Bug fixes By `Jens Svensmark `_ - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. +- Preserve dimension and coordinate order during :py:func:`xarray.concat` (:issue:`2811`, :issue:`4072`, :pull:`4419`). + By `Kai Mühlbauer `_. - Avoid relying on :py:class:`set` objects for the ordering of the coordinates (:pull:`4409`) By `Justus Magin `_. - Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 0955a95fa8b..3a39369e793 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -463,6 +463,9 @@ def ensure_common_dims(vars): combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) result_vars[k] = combined + elif k in result_vars: + # preserves original variable order + result_vars[k] = result_vars.pop(k) result = Dataset(result_vars, attrs=result_attrs) absent_coord_names = coord_names - set(result.variables) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 07ae83d3862..0d5507b6879 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -558,3 +558,36 @@ def test_concat_merge_single_non_dim_coord(): for coords in ["different", "all"]: with raises_regex(ValueError, "'y' not present in all datasets"): concat([da1, da2, da3], dim="x") + + +def test_concat_preserve_coordinate_order(): + x = np.arange(0, 5) + y = np.arange(0, 10) + time = np.arange(0, 4) + data = np.zeros((4, 10, 5), dtype=bool) + + ds1 = Dataset( + {"data": (["time", "y", "x"], data[0:2])}, + coords={"time": time[0:2], "y": y, "x": x}, + ) + ds2 = Dataset( + {"data": (["time", "y", "x"], data[2:4])}, + coords={"time": time[2:4], "y": y, "x": x}, + ) + + expected = Dataset( + {"data": (["time", "y", "x"], data)}, + coords={"time": time, "y": y, "x": x}, + ) + + actual = concat([ds1, ds2], dim="time") + + # check dimension order + for act, exp in zip(actual.dims, expected.dims): + assert act == exp + assert actual.dims[act] == expected.dims[exp] + + # check coordinate order + for act, exp in zip(actual.coords, expected.coords): + assert act == exp + assert_identical(actual.coords[act], expected.coords[exp]) From 0af238c153f4a7c8b27d41893c9893e9db2b72f0 Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 19 Sep 2020 12:38:11 +0200 Subject: [PATCH 10/22] add a ci for doctests (#4437) * add a ci for doctests * rename the step --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8061c9895ca..74cb13dc985 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,6 +74,18 @@ jobs: - bash: black --check . 
displayName: black formatting check +- job: Doctests + variables: + conda_env: py38 + pool: + vmImage: 'ubuntu-16.04' + steps: + - template: ci/azure/install.yml + - bash: | + source activate xarray-tests + python -m pytest --doctest-modules xarray --ignore xarray/tests + displayName: Run doctests + - job: TypeChecking variables: conda_env: py38 From 894b26e7061087a5b7a814d80007bc6fea20a5c5 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 19 Sep 2020 18:34:43 +0000 Subject: [PATCH 11/22] Fix doctests (#4439) --- xarray/core/combine.py | 2 +- xarray/core/groupby.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d9ce3def673..5b3a8bef6a5 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -711,8 +711,8 @@ def combine_by_coords( Dimensions: (x: 3, y: 4) Coordinates: - * x (x) int64 10 20 30 * y (y) int64 0 1 2 3 + * x (x) int64 10 20 30 Data variables: temperature (y, x) float64 10.98 14.3 12.06 10.9 ... 1.743 0.4044 16.65 precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.7992 0.4615 0.7805 diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 5f328d7a03a..9cdd86e8122 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -607,8 +607,8 @@ def quantile( array([[0.7, 4.2, 0.7, 1.5], [6.5, 7.3, 2.6, 1.9]]) Coordinates: - quantile float64 0.0 * y (y) int64 1 1 2 2 + quantile float64 0.0 * x (x) int64 0 1 >>> ds.groupby("y").quantile(0, dim=...) @@ -630,8 +630,8 @@ def quantile( [2.6 , 2.6 , 2.6 ], [1.9 , 1.9 , 1.9 ]]]) Coordinates: - * quantile (quantile) float64 0.0 0.5 1.0 * y (y) int64 1 1 2 2 + * quantile (quantile) float64 0.0 0.5 1.0 * x (x) int64 0 1 >>> ds.groupby("y").quantile([0, 0.5, 1], dim=...) From cc4df343db46f802acb985e2f88256c061d49153 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 19 Sep 2020 11:35:24 -0700 Subject: [PATCH 12/22] Small updates to How-to-release + lint (#4436) * Lint markdown * _ * _ --- HOW_TO_RELEASE.md | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index ec0cca59545..6521cdcd82a 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -3,18 +3,21 @@ Time required: about an hour. These instructions assume that `upstream` refers to the main repository: -``` + +```sh $ git remote -v {...} upstream https://github.com/pydata/xarray (fetch) upstream https://github.com/pydata/xarray (push) ``` + + 1. Ensure your master branch is synced to upstream: ```sh git pull upstream master ``` - 2. Get a list of contributors with: + 2. Add a list of contributors with: ```sh git log "$(git tag --sort="v:refname" | sed -n 'x;$p').." --format=%aN | sort -u | perl -pe 's/\n/$1, /' ``` @@ -22,9 +25,12 @@ upstream https://github.com/pydata/xarray (push) ```sh git log v{0.X.Y-1}.. --format=%aN | sort -u | perl -pe 's/\n/$1, /' ``` - Add these into `whats-new.rst` somewhere :) + This will return the number of contributors: + ```sh + git log v{0.X.Y-1}.. --format=%aN | sort -u | wc -l + ``` 3. Write a release summary: ~50 words describing the high level features. This - will be used in the release emails, tweets, GitHub release notes, etc. + will be used in the release emails, tweets, GitHub release notes, etc. 4. Look over whats-new.rst and the docs. Make sure "What's New" is complete (check the date!) and add the release summary at the top. 
Things to watch out for: @@ -45,7 +51,7 @@ upstream https://github.com/pydata/xarray (push) ``` 8. Check that the ReadTheDocs build is passing. 9. On the master branch, commit the release in git: - ```s + ```sh git commit -am 'Release v{0.X.Y}' ``` 10. Tag the release: @@ -67,7 +73,7 @@ upstream https://github.com/pydata/xarray (push) twine upload dist/xarray-{0.X.Y}* ``` You will need to be listed as a package owner at - https://pypi.python.org/pypi/xarray for this to work. + for this to work. 14. Push your changes to master: ```sh git push upstream master @@ -80,11 +86,11 @@ upstream https://github.com/pydata/xarray (push) git push --force upstream stable git checkout master ``` - It's OK to force push to 'stable' if necessary. (We also update the stable - branch with `git cherry-pick` for documentation only fixes that apply the + It's OK to force push to 'stable' if necessary. (We also update the stable + branch with `git cherry-pick` for documentation only fixes that apply the current released version.) 16. Add a section for the next release {0.X.Y+1} to doc/whats-new.rst: - ``` + ```rst .. _whats-new.{0.X.Y+1}: v{0.X.Y+1} (unreleased) @@ -116,12 +122,12 @@ upstream https://github.com/pydata/xarray (push) ``` You're done pushing to master! 18. Issue the release on GitHub. Click on "Draft a new release" at - https://github.com/pydata/xarray/releases. Type in the version number + . Type in the version number and paste the release summary in the notes. -19. Update the docs. Login to https://readthedocs.org/projects/xray/versions/ +19. Update the docs. Login to and switch your new release tag (at the bottom) from "Inactive" to "Active". It should now build automatically. -20. Issue the release announcement to mailing lists & Twitter. For bug fix releases, I +20. Issue the release announcement to mailing lists & Twitter. For bug fix releases, I usually only email xarray@googlegroups.com. For major/feature releases, I will email a broader list (no more than once every 3-6 months): - pydata@googlegroups.com @@ -133,6 +139,8 @@ upstream https://github.com/pydata/xarray (push) Google search will turn up examples of prior release announcements (look for "ANN xarray"). + + ## Note on version numbering We follow a rough approximation of semantic version. Only major releases (0.X.0) From 0c26211566d620b2f81dd79c15f8afcc37faacbc Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 19 Sep 2020 17:06:40 -0700 Subject: [PATCH 13/22] Release notes for 0.16.1 (#4435) * Release notes for 0.16.1 * Update doc/whats-new.rst Co-authored-by: Deepak Cherian * Update doc/whats-new.rst Co-authored-by: keewis * Update doc/whats-new.rst Co-authored-by: Deepak Cherian * Update doc/whats-new.rst * _ Co-authored-by: Deepak Cherian Co-authored-by: keewis --- doc/whats-new.rst | 81 ++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 78e49f711e1..4c0658f6972 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -16,58 +16,73 @@ What's New .. _whats-new.0.16.1: -v0.16.1 (unreleased) +v0.16.1 (2020-09-20) --------------------- +This patch release fixes an incompatability with a recent pandas change, which +was causing an issue indexing with a ``datetime64``. It also includes +improvements to ``rolling``, ``to_dataframe``, ``cov`` & ``corr`` methods and +bug fixes. 
+ +Many thanks to the 36 contributors who contributed to this release: + +Aaron Spring, Akio Taniguchi, Aleksandar Jelenak, Alexandre Poux, +Caleb, Dan Nowacki, Deepak Cherian, Gerardo Rivera, Jacob Tomlinson, James A. +Bednar, Joe Hamman, Julia Kent, Kai Mühlbauer, Keisuke Fujii, Mathias Hauser, +Maximilian Roos, Nick R. Papior, Pascal Bourgault, Peter Hausamann, Romain +Martinez, Russell Manser, Samnan Rahee, Sander, Spencer Clark, Stephan Hoyer, +Thomas Zilio, Tobias Kölling, Tom Augspurger, alexamici, crusaderky, darikg, +inakleinbottle, jenssss, johnomotani, keewis, rpgoldman + Breaking changes ~~~~~~~~~~~~~~~~ + - :py:meth:`DataArray.astype` and :py:meth:`Dataset.astype` now preserve attributes. Keep the old behavior by passing `keep_attrs=False` (:issue:`2049`, :pull:`4314`). By `Dan Nowacki `_ and `Gabriel Joel Mitchell `_. New Features ~~~~~~~~~~~~ -- Support multiple outputs in :py:func:`xarray.apply_ufunc` when using ``dask='parallelized'``. (:issue:`1815`, :pull:`4060`) - By `Kai Mühlbauer `_. + - :py:meth:`~xarray.DataArray.rolling` and :py:meth:`~xarray.Dataset.rolling` now accept more than 1 dimension. (:pull:`4219`) By `Keisuke Fujii `_. +- :py:meth:`~xarray.DataArray.to_dataframe` and :py:meth:`~xarray.Dataset.to_dataframe` + now accept a ``dim_order`` parameter allowing to specify the resulting dataframe's + dimensions order (:issue:`4331`, :pull:`4333`). + By `Thomas Zilio `_. +- Support multiple outputs in :py:func:`xarray.apply_ufunc` when using + ``dask='parallelized'``. (:issue:`1815`, :pull:`4060`). + By `Kai Mühlbauer `_. - ``min_count`` can be supplied to reductions such as ``.sum`` when specifying - multiple dimension to reduce over. (:pull:`4356`) + multiple dimension to reduce over; (:pull:`4356`). By `Maximilian Roos `_. -- :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values. (:pull:`4351`) +- :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values; (:pull:`4351`). By `Maximilian Roos `_. +- Add support for parsing datetime strings formatted following the default + string representation of cftime objects, i.e. YYYY-MM-DD hh:mm:ss, in + partial datetime string indexing, as well as :py:meth:`~xarray.cftime_range` + (:issue:`4337`). By `Spencer Clark `_. - Build ``CFTimeIndex.__repr__`` explicitly as :py:class:`pandas.Index`. Add ``calendar`` as a new property for :py:class:`CFTimeIndex` and show ``calendar`` and ``length`` in ``CFTimeIndex.__repr__`` (:issue:`2416`, :pull:`4092`) By `Aaron Spring `_. -- Relaxed the :ref:`mindeps_policy` to support: - - - all versions of setuptools released in the last 42 months (but no older than 38.4) - - all versions of dask and dask.distributed released in the last 12 months (but no - older than 2.9) - - all versions of other packages released in the last 12 months - - All are up from 6 months (:issue:`4295`) - `Guido Imperiale `_. - Use a wrapped array's ``_repr_inline_`` method to construct the collapsed ``repr`` of :py:class:`DataArray` and :py:class:`Dataset` objects and document the new method in :doc:`internals`. (:pull:`4248`). By `Justus Magin `_. -- Add support for parsing datetime strings formatted following the default - string representation of cftime objects, i.e. YYYY-MM-DD hh:mm:ss, in - partial datetime string indexing, as well as :py:meth:`~xarray.cftime_range` - (:issue:`4337`). By `Spencer Clark `_. 
-- :py:meth:`~xarray.DataArray.to_dataframe` and :py:meth:`~xarray.Dataset.to_dataframe` - now accept a ``dim_order`` parameter allowing to specify the resulting dataframe's - dimensions order (:issue:`4331`, :pull:`4333`). - By `Thomas Zilio `_. +- Allow per-variable fill values in most functions. (:pull:`4237`). + By `Justus Magin `_. - Expose ``use_cftime`` option in :py:func:`~xarray.open_zarr` (:issue:`2886`, :pull:`3229`) By `Samnan Rahee `_ and `Anderson Banihirwe `_. Bug fixes ~~~~~~~~~ + +- Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). + By `Stephan Hoyer `_ and + `Justus Magin `_. - Variables which are chunked using dask only along some dimensions can be chunked while storing with zarr along previously unchunked dimensions (:pull:`4312`) By `Tobias Kölling `_. - Fixed a bug in backend caused by basic installation of Dask (:issue:`4164`, :pull:`4318`) @@ -86,13 +101,14 @@ Bug fixes By `Jens Svensmark `_ - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. +- Fix ``pip install .`` when no ``.git`` directory exists; namely when the xarray source + directory has been rsync'ed by PyCharm Professional for a remote deployment over SSH. + By `Guido Imperiale `_ - Preserve dimension and coordinate order during :py:func:`xarray.concat` (:issue:`2811`, :issue:`4072`, :pull:`4419`). By `Kai Mühlbauer `_. - Avoid relying on :py:class:`set` objects for the ordering of the coordinates (:pull:`4409`) By `Justus Magin `_. -- Fix indexing with datetime64 scalars with pandas 1.1 (:issue:`4283`). - By `Stephan Hoyer `_ and - `Justus Magin `_. + Documentation ~~~~~~~~~~~~~ @@ -102,7 +118,7 @@ Documentation - Removed skipna argument from :py:meth:`DataArray.count`, :py:meth:`DataArray.any`, :py:meth:`DataArray.all`. (:issue:`755`) By `Sander van Rijn `_ - Update the contributing guide to use merges instead of rebasing and state - that we squash-merge. (:pull:`4355`) By `Justus Magin `_. + that we squash-merge. (:pull:`4355`). By `Justus Magin `_. - Make sure the examples from the docstrings actually work (:pull:`4408`). By `Justus Magin `_. - Updated Vectorized Indexing to a clearer example. @@ -110,11 +126,18 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ + +- Relaxed the :ref:`mindeps_policy` to support: + + - all versions of setuptools released in the last 42 months (but no older than 38.4) + - all versions of dask and dask.distributed released in the last 12 months (but no + older than 2.9) + - all versions of other packages released in the last 12 months + + All are up from 6 months (:issue:`4295`) + `Guido Imperiale `_. - Use :py:func:`dask.array.apply_gufunc` instead of :py:func:`dask.array.blockwise` in :py:func:`xarray.apply_ufunc` when using ``dask='parallelized'``. (:pull:`4060`, :pull:`4391`, :pull:`4392`) -- Fix ``pip install .`` when no ``.git`` directory exists; namely when the xarray source - directory has been rsync'ed by PyCharm Professional for a remote deployment over SSH. - By `Guido Imperiale `_ - Align ``mypy`` versions to ``0.782`` across ``requirements`` and ``.pre-commit-config.yml`` files. (:pull:`4390`) By `Maximilian Roos `_ From 13c09dc28ec8ff791c6d87e2d8e80c362c65ffd4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 20 Sep 2020 00:21:56 -0500 Subject: [PATCH 14/22] Fixed dask.optimize on datasets (#4438) * Fixed dask.optimize on datasets Another attempt to fix #3698. The issue with my fix in is that we hit `Variable._dask_finalize` in both `dask.optimize` and `dask.persist`. 
We want to do the culling of unnecessary tasks (`test_persist_Dataset`) but only in the persist case, not optimize (`test_optimize`). * Update whats-new.rst * Update doc/whats-new.rst Co-authored-by: Deepak Cherian Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 3 ++- xarray/core/dataset.py | 11 ++++++++++- xarray/core/variable.py | 3 --- xarray/tests/test_dask.py | 8 ++++++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4c0658f6972..82f51a1beec 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -101,6 +101,8 @@ Bug fixes By `Jens Svensmark `_ - Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`). By `Peter Hausamann `_. +- Fix ``dask.optimize`` on ``DataArray`` producing an invalid Dask task graph (:issue:`3698`) + By `Tom Augspurger `_ - Fix ``pip install .`` when no ``.git`` directory exists; namely when the xarray source directory has been rsync'ed by PyCharm Professional for a remote deployment over SSH. By `Guido Imperiale `_ @@ -109,7 +111,6 @@ Bug fixes - Avoid relying on :py:class:`set` objects for the ordering of the coordinates (:pull:`4409`) By `Justus Magin `_. - Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ce72d4a5886..1777ee356af 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -777,10 +777,19 @@ def _dask_postcompute(results, info, *args): @staticmethod def _dask_postpersist(dsk, info, *args): variables = {} + # postpersist is called in both dask.optimize and dask.persist + # When persisting, we want to filter out unrelated keys for + # each Variable's task graph. + is_persist = len(dsk) == len(info) for is_dask, k, v in info: if is_dask: func, args2 = v - result = func(dsk, *args2) + if is_persist: + name = args2[1][0] + dsk2 = {k: v for k, v in dsk.items() if k[0] == name} + else: + dsk2 = dsk + result = func(dsk2, *args2) else: result = v variables[k] = result diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6de00ee882a..c55e61cb816 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -501,9 +501,6 @@ def __dask_postpersist__(self): @staticmethod def _dask_finalize(results, array_func, array_args, dims, attrs, encoding): - if isinstance(results, dict): # persist case - name = array_args[0] - results = {k: v for k, v in results.items() if k[0] == name} data = array_func(results, *array_args) return Variable(dims, data, attrs=attrs, encoding=encoding) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 46685a29a47..7d664aca3e4 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1607,3 +1607,11 @@ def test_more_transforms_pass_lazy_array_equiv(map_da, map_ds): assert_equal(map_da._from_temp_dataset(map_da._to_temp_dataset()), map_da) assert_equal(map_da.astype(map_da.dtype), map_da) assert_equal(map_da.transpose("y", "x", transpose_coords=False).cxy, map_da.cxy) + + +def test_optimize(): + # https://github.com/pydata/xarray/issues/3698 + a = dask.array.ones((10, 4), chunks=(5, 2)) + arr = xr.DataArray(a).chunk(5) + (arr2,) = dask.optimize(arr) + arr2.compute() From 633187e32be3559ee4989b5049449d51372a1178 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sun, 20 Sep 2020 12:33:35 -0700 Subject: [PATCH 15/22] Add notes re doctests (#4440) --- doc/whats-new.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff 
--git a/doc/whats-new.rst b/doc/whats-new.rst index 82f51a1beec..0ff11048901 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,8 @@ v0.16.1 (2020-09-20) This patch release fixes an incompatability with a recent pandas change, which was causing an issue indexing with a ``datetime64``. It also includes improvements to ``rolling``, ``to_dataframe``, ``cov`` & ``corr`` methods and -bug fixes. +bug fixes. Our documentation has a number of improvements, including fixing all +doctests and confirming their accuracy on every commit. Many thanks to the 36 contributors who contributed to this release: @@ -32,7 +33,7 @@ Bednar, Joe Hamman, Julia Kent, Kai Mühlbauer, Keisuke Fujii, Mathias Hauser, Maximilian Roos, Nick R. Papior, Pascal Bourgault, Peter Hausamann, Romain Martinez, Russell Manser, Samnan Rahee, Sander, Spencer Clark, Stephan Hoyer, Thomas Zilio, Tobias Kölling, Tom Augspurger, alexamici, crusaderky, darikg, -inakleinbottle, jenssss, johnomotani, keewis, rpgoldman +inakleinbottle, jenssss, johnomotani, keewis, and rpgoldman. Breaking changes ~~~~~~~~~~~~~~~~ @@ -128,6 +129,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Fixed all doctests and enabled their running in CI. + By `Justus Magin `_. - Relaxed the :ref:`mindeps_policy` to support: - all versions of setuptools released in the last 42 months (but no older than 38.4) From cb6f59e6bcfe11656d68c39a7a5c4cc2730ac98d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sun, 20 Sep 2020 12:40:50 -0700 Subject: [PATCH 16/22] New whatsnew section --- doc/whats-new.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0ff11048901..b4f34161abb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,32 @@ What's New np.random.seed(123456) + +.. _whats-new.{0.16.2}: + +v{0.16.2} (unreleased) +--------------------- + +Breaking changes +~~~~~~~~~~~~~~~~ + + +New Features +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + .. _whats-new.0.16.1: v0.16.1 (2020-09-20) From 788cd605d8270c6aeb6435eb0760eb5f2d97228a Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sun, 20 Sep 2020 12:42:49 -0700 Subject: [PATCH 17/22] Fix release notes typo --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b4f34161abb..395a37ac100 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,7 +45,7 @@ Internal Changes v0.16.1 (2020-09-20) --------------------- -This patch release fixes an incompatability with a recent pandas change, which +This patch release fixes an incompatibility with a recent pandas change, which was causing an issue indexing with a ``datetime64``. It also includes improvements to ``rolling``, ``to_dataframe``, ``cov`` & ``corr`` methods and bug fixes. Our documentation has a number of improvements, including fixing all From b637c876f61314ae36fb7d014d6978de463079da Mon Sep 17 00:00:00 2001 From: Luke Volpatti Date: Thu, 25 Jun 2020 15:37:55 -0400 Subject: [PATCH 18/22] Fix typo (#4181) --- doc/quick-overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/quick-overview.rst b/doc/quick-overview.rst index 09b0d4c6fbb..e3d1456f017 100644 --- a/doc/quick-overview.rst +++ b/doc/quick-overview.rst @@ -46,7 +46,7 @@ Here are the key properties for a ``DataArray``: Indexing -------- -xarray supports four kind of indexing. 
Since we have assigned coordinate labels to the x dimension we can use label-based indexing along that dimension just like pandas. The four examples below all yield the same result (the value at `x=10`) but at varying levels of convenience and intuitiveness. +xarray supports four kinds of indexing. Since we have assigned coordinate labels to the x dimension we can use label-based indexing along that dimension just like pandas. The four examples below all yield the same result (the value at `x=10`) but at varying levels of convenience and intuitiveness. .. ipython:: python From 6912e80fb9cc38af51748bf8e4c68fd87c478f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 22 Jul 2020 19:07:25 +0200 Subject: [PATCH 19/22] fix typo in io.rst (#4250) --- doc/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/io.rst b/doc/io.rst index 4aac5e0b6f7..956d9394653 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -26,7 +26,7 @@ The recommended way to store xarray data structures is `netCDF`__, which is a binary file format for self-described datasets that originated in the geosciences. xarray is based on the netCDF data model, so netCDF files on disk directly correspond to :py:class:`Dataset` objects (more accurately, -a group in a netCDF file directly corresponds to a to :py:class:`Dataset` object. +a group in a netCDF file directly corresponds to a :py:class:`Dataset` object. See :ref:`io.netcdf_groups` for more.) NetCDF is supported on almost all platforms, and parsers exist From 1155f5646e07100e4acda18db074b148f1213b5d Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sun, 20 Sep 2020 16:31:38 -0700 Subject: [PATCH 20/22] Fix release notes formatting (#4443) * Fix release notes rst * Update doc/whats-new.rst Co-authored-by: keewis Co-authored-by: keewis --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 395a37ac100..36611555a7d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,10 +15,10 @@ What's New np.random.seed(123456) -.. _whats-new.{0.16.2}: +.. _whats-new.0.16.2: -v{0.16.2} (unreleased) ---------------------- +v0.16.2 (unreleased) +-------------------- Breaking changes ~~~~~~~~~~~~~~~~ From 5654aee927586c2dcbc3f34d674ed5c9646326c1 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Tue, 22 Sep 2020 17:40:30 +1200 Subject: [PATCH 21/22] Xarray open_mfdataset with engine Zarr (#4187) * create def for multiple zarr files and added commentary/definition, which matches almost exactly that of ``xr.open_mfdatasets``, but without ``engine`` * just as with ``xr.open_mfdatasets``, identify the paths as local directory paths/strings * added error if no path * finished copying similar code from `xr.open_mfdatasets` * remove blank lines * fixed typo * added ``xr.open_mzarr()`` to the list of available modules to call * imported missing function * imported missing glob * imported function from backend.api * imported function to facilitate mzarr * correctly imported functions from core to mzarr * imported to use on open_mzarr * removed lock and autoclose since not taken by ``open_zarr`` * fixed typo * class is not needed since zarr stores don't remain open * removed old behavior * set default * listed open_mzarr * removed unused imported function * imported Path - hadn't before * remove unnecessary comments * modified comments * isorted zarr * isorted * erased open_mzarr.
Added capability to open_dataset to open zarr files * removed imported but unused * comment to `zarr` engine * added chunking code from `open_zarr` * remove import `open_mzarr`` * removed `open_mzarr`` from top-level-function * missing return in nested function * moved outside of nested function, had touble with reading before assignement * added missing argument associated with zarr stores, onto the definition of open_dataset * isort zarr.py * removed blank lines, fixed typo on `chunks` * removed imported but unused * restored conditional for `auto` * removed imported but unused `dask.array` * added capabilities for file_or_obj to be a mutablemapper such as `fsspec.get_mapper`, and thus compatible with `intake-xarray` * moved to a different conditional since file_or_obj is a mutablemapping, not a str, path or AbstractDataStore * isort api.py * restored the option for when file_or_obk is a str, such as an url. * fixed relabel * update open_dataset for zarr files * remove open_zarr from tests, now open_dataset(engine=`zarr`) * remove extra file, and raise deprecating warning on open_zarr * added internal call to open_dataset from depricated open_zarr * defined engine=`zarr` * correct argument for open_dataset * pass arguments as backend_kwargs * pass backend_kwargs as argument * typo * set `overwrite_enconded_chunks as backend_kwargs * do not pass as backend, use for chunking * removed commented code * moved definitions to zarr backends * Ensure class functions have necessary variables Was missing some 'self' and other kwarg variables. Also linted using black. * Combine MutableMapping and Zarr engine condition As per https://github.com/pydata/xarray/pull/4003#discussion_r441978720. * Pop out overwrite_encoded_chunks after shallow copy backend_kwargs dict Don't pop the backend_kwargs dict as per https://github.com/pydata/xarray/pull/4003#discussion_r441979810, make a shallow copy of the backend_kwargs dictionary first. Also removed `overwrite_encoded_chunks` as a top level kwarg of `open_dataset`. Instead, pass it to `backend_kwargs` when using engine="zarr". * Fix some errors noticed by PEP8 * Reorganize code in backends api.py and actually test using engine zarr Merge at 1977ba16147f6c0dfaac8f9f720698b622a5acfd wasn't done very well. Reorganized the logic of the code to reduce the diff with xarray master, and ensure that the zarr backend tests actually have engine="zarr" in them. * Add back missing decode_timedelta kwarg * Add back a missing engine="zarr" to test_distributed.py * Ensure conditional statements make sense * Fix UnboundLocalError on 'chunks' referenced before assignment Need to pass in chunks to maybe_decode_store, to resolve UnboundLocalError: local variable 'chunks' referenced before assignment. * Run isort to fix import order * Fix tests where kwargs needs to be inside of backend_kwargs dict now Also temporarily silence deprecate_auto_chunk tests using pytest.raises(TypeError). May remove those fully later. * Change open_zarr to open_dataset with engine="zarr" in io.rst * Fix test_distributed by wrapping consolidated in backend_kwargs dict Patches cb6d06606a9f5a9418da57006c8e976d3d362def. * Ensure read-only mode when using open_dataset with engine="zarr" * Turn chunks from "auto" to None if dask is not available * Add back a missing else statement in maybe_chunk * Allow xfail test_vectorized_indexing when has_dask Instead of when not has_dask. 
* Typo on chunks arg in open_dataset * Fix ZeroDivisionError by adding back check that chunks is not False Yet another if-statement that wasn't properly transferred from zarr.py to api.py. * Fix a typo that was causing TypeError: 'method' object is not iterable * Move the `if not chunks` block to after auto detect Patches logic of 6fbeadf41a1a547383da0c8f4499c99099dbdf97 to fix errors when Dask is not installed. * Revert "Allow xfail test_vectorized_indexing when has_dask" This reverts commit aca2012fb5f46e839c980781b50e8bf8b0562ed0. * Temporarily xfail test_vectorized_indexing with or without dask * Put zarr in open_mfdataset engine list * Test open_mfdataset_manyfiles with engine zarr Zarr objects are folders which seem to cause issues with closing, so added a try-except to api.py to catch failures in f.close(). Some tests fail when chunks=None because a numpy array is returned instead of a dask array. * Remember to set a ._file_obj when using Zarr Yet another logic error fixed, resolves the try-except hack in b9a239eff23378015896191c5ad237733a4795bd. * Expect np.ndarray when using open_mfdataset on Zarr with chunks None * Add an entry to what's new for open_mfdataset with Zarr engine Plus a small formatting fix in open_mfdataset docstring * Make zarr engine's custom chunk mechanism more in line with ds.chunk Slightly edited the token name string to start with 'xarray' and include chunks in tokenize. Also replace the deprecated `_replace_vars_and_dims` method with just `_replace`. * Workaround problem where dask arrays aren't returned when chunks is None Revert 827e546155a157f64dfe1585bf09ad733bc52543 and workaround to get dask arrays by fixing some if-then logic in the code when `engine="zarr"` is involved. Things work fine when using chunks="auto", perhaps because the try `import dask.array` is needed to trigger loading into dask arrays? Also removed using chunks="auto" in some Zarr tests to simplify. * Default to chunks="auto" for Zarr tests to fix test_vectorized_indexing Revert hack in 6b99225fc17fe7c51423b30c66914709e5239a05 as test_vectorized_indexing now works on dask, specifically the negative slices test. It will still fail without dask, as was the behaviour before. Solution was to set `chunks="auto"` as the default when testing using `open_dataset` with `engine="zarr"`, similar to the default for `open_zarr`. Reverted some aspects of dce4e7cd1fcf35fb7d3293bf6cc410646b588c64 to ensure this `chunks="auto"` setting is visible throughout the Zarr test suite. * Fix test by passing in chunk_store to backend_kwargs * Revert "Change open_zarr to open_dataset with engine="zarr" in io.rst" This reverts commit cd0b9efe5dd573b4234493a1c491dc11b13574cf. * Remove open_zarr DeprecationWarning Partially reverts b488363b32705e6bd0b174b927cb129d247f5d69.
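(Editor's aside: the chunk-handling bullets above are easier to follow with the resulting behaviour spelled out. A small sketch, assuming a hypothetical pre-existing store "example_store.zarr" containing a data variable "var" with an "x" dimension: with ``engine="zarr"``, ``chunks=None`` keeps variables out of dask entirely, while ``chunks="auto"`` or an explicit int/dict produces dask arrays.)

    import xarray as xr

    store = "example_store.zarr"  # hypothetical existing zarr store

    # chunks=None: no dask; .chunks is None and only index variables load eagerly
    ds_numpy = xr.open_dataset(store, engine="zarr", chunks=None)
    assert ds_numpy["var"].chunks is None  # "var" is a hypothetical data variable

    # chunks="auto": each variable becomes a dask array chunked like the zarr store
    ds_auto = xr.open_dataset(store, engine="zarr", chunks="auto")
    print(ds_auto["var"].chunks)

    # An explicit mapping overrides the on-disk chunking per dimension.
    ds_manual = xr.open_dataset(store, engine="zarr", chunks={"x": 2})
    print(ds_manual["var"].chunks)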
* Update open_dataset docstring to specify chunk options for zarr engine * Let only chunks = None return non-chunked arrays * Remove for-loop in test_manual_chunk since testing only one no_chunk * Update open_dataset docstring to remove mention of chunks=None with Zarr Co-authored-by: Miguel Jimenez-Urias Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 3 + xarray/backends/api.py | 74 +++++++++++-- xarray/backends/zarr.py | 183 +++++++++++-------------------- xarray/tests/test_backends.py | 56 ++++++---- xarray/tests/test_distributed.py | 8 +- 5 files changed, 170 insertions(+), 154 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 36611555a7d..5ee67efb1da 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,6 +27,9 @@ Breaking changes New Features ~~~~~~~~~~~~ +- :py:func:`open_dataset` and :py:func:`open_mfdataset` + now works with ``engine="zarr"`` (:issue:`3668`, :pull:`4003`, :pull:`4187`). + By `Miguel Jimenez `_ and `Wei Ji Leong `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9f45474e7e7..cd1ee88f504 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1,5 +1,6 @@ import os.path import warnings +from collections.abc import MutableMapping from glob import glob from io import BytesIO from numbers import Number @@ -344,14 +345,16 @@ def open_dataset( If True, decode the 'coordinates' attribute to identify coordinates in the resulting dataset. engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ - "pseudonetcdf"}, optional + "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for "netcdf4". chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask + If chunks is provided, it is used to load the new dataset into dask arrays. ``chunks={}`` loads the dataset with dask using a single - chunk for all arrays. + chunk for all arrays. When using ``engine="zarr"`, setting + `chunks='auto'` will create dask chunks based on the variable's zarr + chunks. lock : False or lock-like, optional Resource lock to use when reading data from disk. Only relevant when using dask or another form of parallelism. By default, appropriate @@ -413,6 +416,7 @@ def open_dataset( "pynio", "cfgrib", "pseudonetcdf", + "zarr", ] if engine not in engines: raise ValueError( @@ -447,7 +451,7 @@ def open_dataset( if backend_kwargs is None: backend_kwargs = {} - def maybe_decode_store(store, lock=False): + def maybe_decode_store(store, chunks, lock=False): ds = conventions.decode_cf( store, mask_and_scale=mask_and_scale, @@ -461,7 +465,7 @@ def maybe_decode_store(store, lock=False): _protect_dataset_variables_inplace(ds, cache) - if chunks is not None: + if chunks is not None and engine != "zarr": from dask.base import tokenize # if passed an actual file path, augment the token with @@ -487,10 +491,40 @@ def maybe_decode_store(store, lock=False): ) name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - ds2._file_obj = ds._file_obj + + elif engine == "zarr": + # adapted from Dataset.Chunk() and taken from open_zarr + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. 
" % chunks + ) + + if chunks == "auto": + try: + import dask.array # noqa + except ImportError: + chunks = None + + # auto chunking needs to be here and not in ZarrStore because + # the variable chunks does not survive decode_cf + # return trivial case + if chunks is None: + return ds + + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + variables = { + k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) + for k, v in ds.variables.items() + } + ds2 = ds._replace(variables) + else: ds2 = ds - + ds2._file_obj = ds._file_obj return ds2 if isinstance(filename_or_obj, Path): @@ -499,6 +533,17 @@ def maybe_decode_store(store, lock=False): if isinstance(filename_or_obj, AbstractDataStore): store = filename_or_obj + elif isinstance(filename_or_obj, MutableMapping) and engine == "zarr": + # Zarr supports a wide range of access modes, but for now xarray either + # reads or writes from a store, never both. + # For open_dataset(engine="zarr"), we only read (i.e. mode="r") + mode = "r" + _backend_kwargs = backend_kwargs.copy() + overwrite_encoded_chunks = _backend_kwargs.pop("overwrite_encoded_chunks", None) + store = backends.ZarrStore.open_group( + filename_or_obj, mode=mode, group=group, **_backend_kwargs + ) + elif isinstance(filename_or_obj, str): filename_or_obj = _normalize_path(filename_or_obj) @@ -526,7 +571,16 @@ def maybe_decode_store(store, lock=False): store = backends.CfGribDataStore( filename_or_obj, lock=lock, **backend_kwargs ) - + elif engine == "zarr": + # on ZarrStore, mode='r', synchronizer=None, group=None, + # consolidated=False. + _backend_kwargs = backend_kwargs.copy() + overwrite_encoded_chunks = _backend_kwargs.pop( + "overwrite_encoded_chunks", None + ) + store = backends.ZarrStore.open_group( + filename_or_obj, group=group, **_backend_kwargs + ) else: if engine not in [None, "scipy", "h5netcdf"]: raise ValueError( @@ -542,7 +596,7 @@ def maybe_decode_store(store, lock=False): ) with close_on_error(store): - ds = maybe_decode_store(store) + ds = maybe_decode_store(store, chunks) # Ensure source filename always stored in dataset object (GH issue #2550) if "source" not in ds.encoding: @@ -794,7 +848,7 @@ def open_mfdataset( If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in ``ds.encoding["source"]``. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", "zarr"}, \ optional Engine to use when reading files. 
If not provided, the default engine is chosen based on available dependencies, with a preference for diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f74fddb694e..2651f3148fd 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -7,6 +7,7 @@ from ..core.pycompat import integer_types from ..core.utils import FrozenDict, HiddenKeyDict from ..core.variable import Variable +from .api import open_dataset from .common import AbstractWritableDataStore, BackendArray, _encode_variable_name # need some special secret attributes to tell us the dimensions @@ -361,6 +362,51 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) + def get_chunk(self, name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == "auto": + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + chunk_spec[dim] = chunks[dim] + return chunk_spec + + def maybe_chunk(self, name, var, chunks, overwrite_encoded_chunks): + chunk_spec = self.get_chunk(name, var, chunks) + + if (var.ndim > 0) and (chunk_spec is not None): + from dask.base import tokenize + + # does this cause any data to be read? + token2 = tokenize(name, var._data, chunks) + name2 = f"xarray-{name}-{token2}" + var = var.chunk(chunk_spec, name=name2, lock=None) + if overwrite_encoded_chunks and var.chunks is not None: + var.encoding["chunks"] = tuple(x[0] for x in var.chunks) + return var + else: + return var + def store( self, variables, @@ -601,130 +647,33 @@ def open_zarr( ---------- http://zarr.readthedocs.io/ """ - if "auto_chunk" in kwargs: - auto_chunk = kwargs.pop("auto_chunk") - if auto_chunk: - chunks = "auto" # maintain backwards compatibility - else: - chunks = None - - warnings.warn( - "auto_chunk is deprecated. Use chunks='auto' instead.", - FutureWarning, - stacklevel=2, - ) if kwargs: raise TypeError( "open_zarr() got unexpected keyword arguments " + ",".join(kwargs.keys()) ) - if not isinstance(chunks, (int, dict)): - if chunks != "auto" and chunks is not None: - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. 
" % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False - - def maybe_decode_store(store, lock=False): - ds = conventions.decode_cf( - store, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - decode_timedelta=decode_timedelta, - use_cftime=use_cftime, - ) + backend_kwargs = { + "synchronizer": synchronizer, + "consolidated": consolidated, + "overwrite_encoded_chunks": overwrite_encoded_chunks, + "chunk_store": chunk_store, + } - # TODO: this is where we would apply caching - - return ds - - # Zarr supports a wide range of access modes, but for now xarray either - # reads or writes from a store, never both. For open_zarr, we only read - mode = "r" - zarr_store = ZarrStore.open_group( - store, - mode=mode, - synchronizer=synchronizer, + ds = open_dataset( + filename_or_obj=store, group=group, - consolidated=consolidated, - chunk_store=chunk_store, + decode_cf=decode_cf, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + engine="zarr", + chunks=chunks, + drop_variables=drop_variables, + backend_kwargs=backend_kwargs, + decode_timedelta=decode_timedelta, + use_cftime=use_cftime, ) - ds = maybe_decode_store(zarr_store) - - # auto chunking needs to be here and not in ZarrStore because variable - # chunks do not survive decode_cf - # return trivial case - if not chunks: - return ds - - # adapted from Dataset.Chunk() - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): - chunks = dict(zip(ds.dims, chunks)) - - def get_chunk(name, var, chunks): - chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) - - # Coordinate labels aren't chunked - if var.ndim == 1 and var.dims[0] == name: - return chunk_spec - - if chunks == "auto": - return chunk_spec - - for dim in var.dims: - if dim in chunks: - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if isinstance(spec, (tuple, list)) and chunk_spec[dim]: - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate Zarr chunk shape %r for " - "dimension %r. This significantly " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - chunk_spec[dim] = chunks[dim] - return chunk_spec - - def maybe_chunk(name, var, chunks): - from dask.base import tokenize - - chunk_spec = get_chunk(name, var, chunks) - - if (var.ndim > 0) and (chunk_spec is not None): - # does this cause any data to be read? 
- token2 = tokenize(name, var._data) - name2 = "zarr-%s" % token2 - var = var.chunk(chunk_spec, name=name2, lock=None) - if overwrite_encoded_chunks and var.chunks is not None: - var.encoding["chunks"] = tuple(x[0] for x in var.chunks) - return var - else: - return var - variables = {k: maybe_chunk(k, v, chunks) for k, v in ds.variables.items()} - return ds._replace_vars_and_dims(variables) + return ds diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 33ac26cfd39..f9cc802f2c8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1555,7 +1555,7 @@ def save(self, dataset, store_target, **kwargs): @contextlib.contextmanager def open(self, store_target, **kwargs): - with xr.open_zarr(store_target, **kwargs) as ds: + with xr.open_dataset(store_target, engine="zarr", **kwargs) as ds: yield ds @contextlib.contextmanager @@ -1565,7 +1565,7 @@ def roundtrip( if save_kwargs is None: save_kwargs = {} if open_kwargs is None: - open_kwargs = {} + open_kwargs = {"chunks": "auto"} with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: @@ -1577,7 +1577,7 @@ def test_roundtrip_consolidated(self): with self.roundtrip( expected, save_kwargs={"consolidated": True}, - open_kwargs={"consolidated": True}, + open_kwargs={"backend_kwargs": {"consolidated": True}}, ) as actual: self.check_dtypes_roundtripped(expected, actual) assert_identical(expected, actual) @@ -1587,7 +1587,7 @@ def test_with_chunkstore(self): with self.create_zarr_target() as store_target, self.create_zarr_target() as chunk_store: save_kwargs = {"chunk_store": chunk_store} self.save(expected, store_target, **save_kwargs) - open_kwargs = {"chunk_store": chunk_store} + open_kwargs = {"backend_kwargs": {"chunk_store": chunk_store}} with self.open(store_target, **open_kwargs) as ds: assert_equal(ds, expected) @@ -1614,16 +1614,14 @@ def test_auto_chunk(self): def test_manual_chunk(self): original = create_test_data().chunk({"dim1": 3, "dim2": 4, "dim3": 3}) - # All of these should return non-chunked arrays - NO_CHUNKS = (None, 0, {}) - for no_chunk in NO_CHUNKS: - open_kwargs = {"chunks": no_chunk} - with self.roundtrip(original, open_kwargs=open_kwargs) as actual: - for k, v in actual.variables.items(): - # only index variables should be in memory - assert v._in_memory == (k in actual.dims) - # there should be no chunks - assert v.chunks is None + # Using chunks = None should return non-chunked arrays + open_kwargs = {"chunks": None} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None # uniform arrays for i in range(2, 6): @@ -1639,7 +1637,10 @@ def test_manual_chunk(self): chunks = {"dim1": 2, "dim2": 3, "dim3": 5} rechunked = original.chunk(chunks=chunks) - open_kwargs = {"chunks": chunks, "overwrite_encoded_chunks": True} + open_kwargs = { + "chunks": chunks, + "backend_kwargs": {"overwrite_encoded_chunks": True}, + } with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks @@ -1678,7 +1679,7 @@ def test_warning_on_bad_chunks(self): @requires_dask def test_deprecate_auto_chunk(self): original = create_test_data().chunk() - with pytest.warns(FutureWarning): + with pytest.raises(TypeError): with 
self.roundtrip(original, open_kwargs={"auto_chunk": True}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory @@ -1686,7 +1687,7 @@ def test_deprecate_auto_chunk(self): # chunk size should be the same as original assert v.chunks == original[k].chunks - with pytest.warns(FutureWarning): + with pytest.raises(TypeError): with self.roundtrip(original, open_kwargs={"auto_chunk": False}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory @@ -1847,7 +1848,9 @@ def test_write_persistence_modes(self, group): ds.to_zarr(store_target, mode="w", group=group) ds_to_append.to_zarr(store_target, append_dim="time", group=group) original = xr.concat([ds, ds_to_append], dim="time") - actual = xr.open_zarr(store_target, group=group) + actual = xr.open_dataset( + store_target, group=group, chunks="auto", engine="zarr" + ) assert_identical(original, actual) def test_compressor_encoding(self): @@ -1938,11 +1941,11 @@ def test_check_encoding_is_consistent_after_append(self): encoding = {"da": {"compressor": compressor}} ds.to_zarr(store_target, mode="w", encoding=encoding) ds_to_append.to_zarr(store_target, append_dim="time") - actual_ds = xr.open_zarr(store_target) + actual_ds = xr.open_dataset(store_target, chunks="auto", engine="zarr") actual_encoding = actual_ds["da"].encoding["compressor"] assert actual_encoding.get_config() == compressor.get_config() assert_identical( - xr.open_zarr(store_target).compute(), + xr.open_dataset(store_target, chunks="auto", engine="zarr").compute(), xr.concat([ds, ds_to_append], dim="time"), ) @@ -1957,7 +1960,9 @@ def test_append_with_new_variable(self): ds_with_new_var.to_zarr(store_target, mode="a") combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] - assert_identical(combined, xr.open_zarr(store_target)) + assert_identical( + combined, xr.open_dataset(store_target, chunks="auto", engine="zarr") + ) @requires_dask def test_to_zarr_compute_false_roundtrip(self): @@ -2567,7 +2572,7 @@ def test_write_inconsistent_chunks(self): assert actual["y"].encoding["chunksizes"] == (100, 50) -@pytest.fixture(params=["scipy", "netcdf4", "h5netcdf", "pynio"]) +@pytest.fixture(params=["scipy", "netcdf4", "h5netcdf", "pynio", "zarr"]) def readengine(request): return request.param @@ -2627,7 +2632,10 @@ def test_open_mfdataset_manyfiles( # split into multiple sets of temp files for ii in original.x.values: subds = original.isel(x=slice(ii, ii + 1)) - subds.to_netcdf(tmpfiles[ii], engine=writeengine) + if writeengine != "zarr": + subds.to_netcdf(tmpfiles[ii], engine=writeengine) + else: # if writeengine == "zarr": + subds.to_zarr(store=tmpfiles[ii]) # check that calculation on opened datasets works properly with open_mfdataset( @@ -2636,7 +2644,7 @@ def test_open_mfdataset_manyfiles( concat_dim="x", engine=readengine, parallel=parallel, - chunks=chunks, + chunks=chunks if (not chunks and readengine != "zarr") else "auto", ) as actual: # check that using open_mfdataset returns dask arrays for variables diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index 8011171d223..7886e9fd0d4 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -135,8 +135,8 @@ def test_dask_distributed_read_netcdf_integration_test( def test_dask_distributed_zarr_integration_test(loop, consolidated, compute): if consolidated: pytest.importorskip("zarr", minversion="2.2.1.dev2") - write_kwargs = dict(consolidated=True) - 
read_kwargs = dict(consolidated=True) + write_kwargs = {"consolidated": True} + read_kwargs = {"backend_kwargs": {"consolidated": True}} else: write_kwargs = read_kwargs = {} chunks = {"dim1": 4, "dim2": 3, "dim3": 5} @@ -151,7 +151,9 @@ def test_dask_distributed_zarr_integration_test(loop, consolidated, compute): ) if not compute: maybe_futures.compute() - with xr.open_zarr(filename, **read_kwargs) as restored: + with xr.open_dataset( + filename, chunks="auto", engine="zarr", **read_kwargs + ) as restored: assert isinstance(restored.var1.data, da.Array) computed = restored.compute() assert_allclose(original, computed) From 57ae5a482e2e64f0fae30038c35e14a127d52a27 Mon Sep 17 00:00:00 2001 From: keewis Date: Tue, 22 Sep 2020 17:07:32 +0200 Subject: [PATCH 22/22] silence sphinx warnings about broken rst (#4448) --- xarray/backends/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index cd1ee88f504..9049db5d602 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -352,8 +352,8 @@ def open_dataset( chunks : int or dict, optional If chunks is provided, it is used to load the new dataset into dask arrays. ``chunks={}`` loads the dataset with dask using a single - chunk for all arrays. When using ``engine="zarr"`, setting - `chunks='auto'` will create dask chunks based on the variable's zarr + chunk for all arrays. When using ``engine="zarr"``, setting + ``chunks='auto'`` will create dask chunks based on the variable's zarr chunks. lock : False or lock-like, optional Resource lock to use when reading data from disk. Only relevant when