From bcfd75931ef80f1bfd4ab4922ae2f6a5d7a92554 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 21 Dec 2018 23:42:13 +0000
Subject: [PATCH 1/7] Add source encoding if not already present when opening
 dataset

---
 xarray/backends/api.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index b4297801309..ccc24665e68 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -338,9 +338,6 @@ def maybe_decode_store(store, lock=False):
         else:
             raise ValueError('unrecognized engine for open_dataset: %r'
                              % engine)
-
-        with close_on_error(store):
-            return maybe_decode_store(store)
     else:
         if engine is not None and engine != 'scipy':
             raise ValueError('can only read file-like objects with '
@@ -348,7 +345,15 @@ def maybe_decode_store(store, lock=False):
                              "default engine or engine='scipy'")
         # assume filename_or_obj is a file-like object
         store = backends.ScipyDataStore(filename_or_obj)
-        return maybe_decode_store(store)
+
+    with close_on_error(store):
+        ds = maybe_decode_store(store)
+
+    # Ensure source filename always stored in dataset object (GH issue #2550)
+    if 'source' not in ds.encoding.keys():
+        if isinstance(filename_or_obj, basestring):
+            ds.encoding['source'] = filename_or_obj
+    return ds
 
 
 def open_dataarray(filename_or_obj, group=None, decode_cf=True,

From dfc55b17347e79c9c90b7eeebb4dac386c51b909 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 21 Dec 2018 23:42:50 +0000
Subject: [PATCH 2/7] Test source encoding present

---
 xarray/tests/test_backends.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index bad9e99c042..1888f0767be 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -3356,3 +3356,14 @@ def test_no_warning_from_dask_effective_get():
             ds = Dataset()
             ds.to_netcdf(tmpfile)
         assert len(record) == 0
+
+
+@requires_scipy_or_netCDF4
+def test_source_encoding_always_present():
+    # Test for GH issue #2550.
+    rnddata = np.random.randn(10)
+    original = Dataset({'foo': ('x', rnddata)})
+    with create_tmp_file() as tmp:
+        original.to_netcdf(tmp)
+        with open_dataset(tmp) as ds:
+            assert ds.encoding['source'] == tmp

From 78587990cfdfdfb74f456289b41be5e0e3036496 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 21 Dec 2018 23:50:42 +0000
Subject: [PATCH 3/7] Updated what's new

---
 doc/whats-new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 5143672a0c9..1c6df91f23a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -45,6 +45,9 @@ Enhancements
   "dayofyear" and "dayofweek" accessors (:issue:`2597`).
   By `Spencer Clark `_.
 - Support Dask ``HighLevelGraphs`` by `Matthew Rocklin `_.
+- Datasets are now guaranteed to have a ``'source'`` encoding, so the source
+  file name is always stored (:issue:`2550`).
+  By `Tom Nicholas `_.
 
 Bug fixes
 ~~~~~~~~~
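For illustration, a minimal sketch of the behaviour the two patches above guarantee, assuming a netCDF4 or scipy backend is installed (the file name example.nc is arbitrary):

    import numpy as np
    import xarray as xr

    # Round-trip a small dataset through disk by file name.
    ds = xr.Dataset({'foo': ('x', np.random.randn(10))})
    ds.to_netcdf('example.nc')

    with xr.open_dataset('example.nc') as reopened:
        # The originating file name is now always recorded in the
        # dataset's encoding, even if the backend did not set it.
        print(reopened.encoding['source'])  # -> 'example.nc'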
From bcc59681c75cc4d01b2b6a22b6bb1e8a5561f05c Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Sat, 22 Dec 2018 17:23:18 +0000
Subject: [PATCH 4/7] Revert "Updated what's new"

This reverts commit 78587990cfdfdfb74f456289b41be5e0e3036496.
---
 doc/whats-new.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index a18aa99cb9a..28ac227df7d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -53,9 +53,6 @@ Enhancements
   "dayofyear" and "dayofweek" accessors (:issue:`2597`).
   By `Spencer Clark `_.
 - Support Dask ``HighLevelGraphs`` by `Matthew Rocklin `_.
-- Datasets are now guaranteed to have a ``'source'`` encoding, so the source
-  file name is always stored (:issue:`2550`).
-  By `Tom Nicholas `_.
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now supports the
   ``loffset`` kwarg just like Pandas.
   By `Deepak Cherian `_

From 8d62c51ccd22670dafc2e9c343c6e1bc752daa8f Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Sun, 23 Dec 2018 19:57:18 +0000
Subject: [PATCH 5/7] Don't close file-like objects

---
 xarray/backends/api.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index ccc24665e68..4b85b348caa 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -300,6 +300,7 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
+        ds = maybe_decode_store(store)
     elif isinstance(filename_or_obj, basestring):
 
         if (isinstance(filename_or_obj, bytes) and
@@ -338,18 +339,19 @@ def maybe_decode_store(store, lock=False):
         else:
             raise ValueError('unrecognized engine for open_dataset: %r'
                              % engine)
+
+        with close_on_error(store):
+            ds = maybe_decode_store(store)
     else:
         if engine is not None and engine != 'scipy':
             raise ValueError('can only read file-like objects with '
                              "default engine or engine='scipy'")
         # assume filename_or_obj is a file-like object
         store = backends.ScipyDataStore(filename_or_obj)
-
-    with close_on_error(store):
         ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)
-    if 'source' not in ds.encoding.keys():
+    if 'source' not in ds.encoding:
         if isinstance(filename_or_obj, basestring):
             ds.encoding['source'] = filename_or_obj
     return ds

From d753780fc3d85e810862c1b2d3a4f8e1f69007f1 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Sun, 23 Dec 2018 20:01:35 +0000
Subject: [PATCH 6/7] Updated what's new

---
 doc/whats-new.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 28ac227df7d..56a28ef7d31 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -53,10 +53,12 @@ Enhancements
   "dayofyear" and "dayofweek" accessors (:issue:`2597`).
   By `Spencer Clark `_.
 - Support Dask ``HighLevelGraphs`` by `Matthew Rocklin `_.
-
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now supports the
   ``loffset`` kwarg just like Pandas.
   By `Deepak Cherian `_
+- Datasets are now guaranteed to have a ``'source'`` encoding, so the source
+  file name is always stored (:issue:`2550`).
+  By `Tom Nicholas `_.
 
 Bug fixes
 ~~~~~~~~~
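A sketch of the corner case that [PATCH 5/7] above handles, assuming scipy is available for the in-memory netCDF3 round trip: a file-like object has no file name, so no 'source' entry can be added, and the object must not be closed on the caller's behalf.

    import io

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'foo': ('x', np.arange(5))})

    # to_netcdf() with no target returns the file contents as bytes.
    buffer = io.BytesIO(ds.to_netcdf())

    with xr.open_dataset(buffer) as reopened:
        # No string file name was given, so no 'source' encoding is set.
        print('source' in reopened.encoding)  # -> False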
From f76593ed6a1738e356fb7f33d947c87df3318ca0 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Sat, 29 Dec 2018 16:21:44 -0800
Subject: [PATCH 7/7] DOC: document source encoding for datasets

---
 doc/io.rst             | 24 +++++++++++++++---------
 xarray/backends/api.py |  3 +++
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/doc/io.rst b/doc/io.rst
index 682fbf5202e..151f5eb740f 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -197,24 +197,30 @@ turn this decoding off manually.
 .. _CF conventions: http://cfconventions.org/
 
 You can view this encoding information (among others) in the
-:py:attr:`DataArray.encoding ` attribute:
+:py:attr:`DataArray.encoding ` and
+:py:attr:`Dataset.encoding ` attributes:
 
 .. ipython::
     :verbatim:
 
     In [1]: ds_disk['y'].encoding
     Out[1]:
-    {'calendar': u'proleptic_gregorian',
-     'chunksizes': None,
+    {'zlib': False,
+     'shuffle': False,
      'complevel': 0,
-     'contiguous': True,
-     'dtype': dtype('float64'),
      'fletcher32': False,
-     'least_significant_digit': None,
-     'shuffle': False,
+     'contiguous': True,
+     'chunksizes': None,
      'source': 'saved_on_disk.nc',
-     'units': u'days since 2000-01-01 00:00:00',
-     'zlib': False}
+     'original_shape': (5,),
+     'dtype': dtype('int64'),
+     'units': 'days since 2000-01-01 00:00:00',
+     'calendar': 'proleptic_gregorian'}
+
+    In [9]: ds_disk.encoding
+    Out[9]:
+    {'unlimited_dims': set(),
+     'source': 'saved_on_disk.nc'}
 
 Note that all operations that manipulate variables other than indexing
 will remove encoding information.

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e858c9a8cbf..244b540d0ca 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -491,6 +491,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    lock=None, data_vars='all', coords='different',
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
+
     Requires dask to be installed. See documentation for details on dask [1].
     Attributes from the first dataset file are used for the combined dataset.
 
@@ -530,6 +531,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         of all non-null values.
     preprocess : callable, optional
         If provided, call this function on each dataset prior to concatenation.
+        You can find the file-name from which each dataset was loaded in
+        ``ds.encoding['source']``.
     engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib'}, optional
         Engine to use when reading files. If not provided, the default engine
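The docstring note added above suggests one way a preprocess callback might use the guaranteed encoding: tagging each input file before concatenation. A sketch, where the add_source_coord helper and the data/*.nc glob are illustrative rather than part of the patch:

    import os

    import xarray as xr

    def add_source_coord(ds):
        # Record the originating file, read from the 'source' encoding
        # that open_dataset sets for string paths.
        filename = os.path.basename(ds.encoding['source'])
        return ds.assign_coords(source_file=filename)

    # Each file's name survives into the combined dataset as a coordinate.
    combined = xr.open_mfdataset('data/*.nc', preprocess=add_source_coord)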