From ffad7d92ed78cac38a7f2f060a1b7797473a7be3 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 23 Jan 2019 11:39:45 -0500
Subject: [PATCH] Added version checking and new tests

---
 intake_xarray/netcdf.py        | 36 ++++++++++++++++++++------
 tests/{util.py => conftest.py} |  6 ++---
 tests/data/example_2.nc        | Bin 0 -> 1736 bytes
 tests/test_catalog.py          |  1 -
 tests/test_intake_xarray.py    | 46 +++++++++++++++++++++++----------
 5 files changed, 64 insertions(+), 25 deletions(-)
 rename tests/{util.py => conftest.py} (87%)
 create mode 100644 tests/data/example_2.nc

diff --git a/intake_xarray/netcdf.py b/intake_xarray/netcdf.py
index 36c64bf..3786881 100644
--- a/intake_xarray/netcdf.py
+++ b/intake_xarray/netcdf.py
@@ -1,5 +1,10 @@
 # -*- coding: utf-8 -*-
-import xarray as xr
+from distutils.version import LooseVersion
+try:
+    import xarray as xr
+    XARRAY_VERSION = LooseVersion(xr.__version__)
+except ImportError:
+    XARRAY_VERSION = None
 from intake.source.base import PatternMixin
 from intake.source.utils import reverse_format
 from .base import DataSourceMixin
@@ -10,38 +15,47 @@ class NetCDFSource(DataSourceMixin, PatternMixin):
 
     Parameters
     ----------
-    urlpath: str
+    urlpath : str
         Path to source file. May include glob "*" characters, format
         pattern strings, or list.
         Some examples:
-            - ``{{ CATALOG_DIR }}data/air.nc``
-            - ``{{ CATALOG_DIR }}data/*.nc``
-            - ``{{ CATALOG_DIR }}data/air_{year}.nc``
-    chunks: int or dict
+            - ``{{ CATALOG_DIR }}/data/air.nc``
+            - ``{{ CATALOG_DIR }}/data/*.nc``
+            - ``{{ CATALOG_DIR }}/data/air_{year}.nc``
+    chunks : int or dict, optional
         Chunks is used to load the new dataset into dask
         arrays. ``chunks={}`` loads the dataset with dask using a single
         chunk for all arrays.
-    path_as_pattern: bool or str, optional
+    concat_dim : str, optional
+        Name of dimension along which to concatenate the files. Can
+        be new or pre-existing. Default is 'concat_dim'.
+    path_as_pattern : bool or str, optional
         Whether to treat the path as a pattern (i.e. ``data_{field}.nc``)
         and create new coordinates in the output corresponding to pattern
         fields. If str, is treated as pattern to match on. Default is True.
     """
     name = 'netcdf'
 
-    def __init__(self, urlpath, chunks, xarray_kwargs=None, metadata=None,
+    def __init__(self, urlpath, chunks=None, concat_dim='concat_dim',
+                 xarray_kwargs=None, metadata=None,
                  path_as_pattern=True, **kwargs):
         self.path_as_pattern = path_as_pattern
         self.urlpath = urlpath
         self.chunks = chunks
+        self.concat_dim = concat_dim
         self._kwargs = xarray_kwargs or kwargs
         self._ds = None
         super(NetCDFSource, self).__init__(metadata=metadata)
 
     def _open_dataset(self):
+        if not XARRAY_VERSION:
+            raise ImportError("xarray not available")
         url = self.urlpath
         kwargs = self._kwargs
         if "*" in url or isinstance(url, list):
             _open_dataset = xr.open_mfdataset
+            if 'concat_dim' not in kwargs:
+                kwargs.update(concat_dim=self.concat_dim)
             if self.pattern:
                 kwargs.update(preprocess=self._add_path_to_ds)
         else:
@@ -52,6 +66,12 @@ def _open_dataset(self):
     def _add_path_to_ds(self, ds):
         """Adding path info to a coord for a particular file
         """
+        if not (XARRAY_VERSION > '0.11.1'):
+            raise ImportError("Your version of xarray is '{}'. "
+                              "The guarantee that the source path is available "
+                              "on the output of open_dataset was added in 0.11.2, "
+                              "so pattern urlpaths are not supported.".format(XARRAY_VERSION))
+
         var = next(var for var in ds)
         new_coords = reverse_format(self.pattern, ds[var].encoding['source'])
         return ds.assign_coords(**new_coords)
diff --git a/tests/util.py b/tests/conftest.py
similarity index 87%
rename from tests/util.py
rename to tests/conftest.py
index b59c906..9d044a9 100644
--- a/tests/util.py
+++ b/tests/conftest.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-import os
+import posixpath
 import pytest
 import shutil
 import tempfile
@@ -11,11 +11,11 @@
 TEST_DATA_DIR = 'tests/data'
 TEST_DATA = 'example_1.nc'
-TEST_URLPATH = os.path.join(TEST_DATA_DIR, TEST_DATA)
+TEST_URLPATH = posixpath.join(TEST_DATA_DIR, TEST_DATA)
 
 
 @pytest.fixture
-def cdf_source():
+def netcdf_source():
     return NetCDFSource(TEST_URLPATH, {})
diff --git a/tests/data/example_2.nc b/tests/data/example_2.nc
new file mode 100644
index 0000000000000000000000000000000000000000..5775622d0ef85828b436dffcd21366f7538fc55c
GIT binary patch
literal 1736
zcmeHGF-s#s6dp~Yc*(&DBG?>TUSkmhf^g>+CsvB!AIR+`lVo7BE3>l!!NS_gHda?y
z`V%ZIEY`|aYp=Y6D~0%dvl+>W;vdMsmp5IGu&RyscVRC2^#K-J~sbu$S3`%+ZS~^MSIJ
z{R3K{1h9^aeB`CSpp&@Uj3eKWXI0io6WPqThLtQsDTW6Szoo4JuK>~gGj4((?oIC&A|Jxw_D*KzjIn%L&8U#czxNE
z%WL;?{*P_hHBzR{I5D;u*=e+d7N@8yK@--K$InmulBec*WRy~Q>ih*9=ggh>rmW@c
zZ_TsNS6Zu|kr``Do=+&rVf|Ym2Q__*W2uMtkp`)XTQaC`Y^<-=SL@>%@XcLekNb(w
z0A>_xz}L}e^aHTK3GfTJ0CM0KxCb79zi9m%*vCwlkmTPH^q*p?!SoHh{suz)xi)w5
zo6Q`c9S+aD&sexJJPR9*#y7?hqn(7yK|Dk);qqV|$JqXt)9G|V;5m=|Ccs{fYn+D>
R#B**A-U+x|pI1Acc>;h`|0@6h

literal 0
HcmV?d00001

diff --git a/tests/test_catalog.py b/tests/test_catalog.py
index 5a76f4a..38b031e 100644
--- a/tests/test_catalog.py
+++ b/tests/test_catalog.py
@@ -4,7 +4,6 @@
 import pytest
 
 from intake import open_catalog
-from .util import dataset  # noqa
 
 
 @pytest.fixture
diff --git a/tests/test_intake_xarray.py b/tests/test_intake_xarray.py
index a0d9f44..277f9bf 100644
--- a/tests/test_intake_xarray.py
+++ b/tests/test_intake_xarray.py
@@ -7,12 +7,10 @@
 
 here = os.path.dirname(__file__)
 
-from .util import TEST_URLPATH, cdf_source, zarr_source, dataset  # noqa
-
-@pytest.mark.parametrize('source', ['cdf', 'zarr'])
-def test_discover(source, cdf_source, zarr_source, dataset):
-    source = {'cdf': cdf_source, 'zarr': zarr_source}[source]
+@pytest.mark.parametrize('source', ['netcdf', 'zarr'])
+def test_discover(source, netcdf_source, zarr_source, dataset):
+    source = {'netcdf': netcdf_source, 'zarr': zarr_source}[source]
     r = source.discover()
     assert r['datashape'] is None
@@ -25,9 +23,9 @@ def test_discover(source, cdf_source, zarr_source, dataset):
     assert set(source.metadata['coords']) == set(dataset.coords.keys())
 
 
-@pytest.mark.parametrize('source', ['cdf', 'zarr'])
-def test_read(source, cdf_source, zarr_source, dataset):
-    source = {'cdf': cdf_source, 'zarr': zarr_source}[source]
+@pytest.mark.parametrize('source', ['netcdf', 'zarr'])
+def test_read(source, netcdf_source, zarr_source, dataset):
+    source = {'netcdf': netcdf_source, 'zarr': zarr_source}[source]
     ds = source.read_chunked()
     assert ds.temp.chunks
@@ -38,8 +36,8 @@ def test_read(source, cdf_source, zarr_source, dataset):
     assert np.all(ds.rh == dataset.rh)
 
 
-def test_read_partition_cdf(cdf_source):
-    source = cdf_source
+def test_read_partition_netcdf(netcdf_source):
+    source = netcdf_source
     with pytest.raises(TypeError):
         source.read_partition(None)
     out = source.read_partition(('temp', 0, 0, 0, 0))
@@ -48,6 +46,28 @@ def 
test_read_partition_cdf(cdf_source): assert np.all(out == expected) +def test_read_list_of_netcdf_files(): + from intake_xarray.netcdf import NetCDFSource + source = NetCDFSource([ + os.path.join(here, 'data', 'example_1.nc'), + os.path.join(here, 'data', 'example_2.nc'), + ]) + d = source.to_dask() + assert d.dims == {'lat': 5, 'lon': 10, 'level': 4, 'time': 1, + 'concat_dim': 2} + + +def test_read_glob_pattern_of_netcdf_files(): + from intake_xarray.netcdf import NetCDFSource + + source = NetCDFSource(os.path.join(here, 'data', 'example_{num: d}.nc'), + concat_dim='num') + d = source.to_dask() + assert d.dims == {'lat': 5, 'lon': 10, 'level': 4, 'time': 1, + 'num': 2} + assert (d.num.data == np.array([1, 2])).all() + + def test_read_partition_zarr(zarr_source): source = zarr_source with pytest.raises(TypeError): @@ -57,9 +77,9 @@ def test_read_partition_zarr(zarr_source): assert np.all(out == expected) -@pytest.mark.parametrize('source', ['cdf', 'zarr']) -def test_to_dask(source, cdf_source, zarr_source, dataset): - source = {'cdf': cdf_source, 'zarr': zarr_source}[source] +@pytest.mark.parametrize('source', ['netcdf', 'zarr']) +def test_to_dask(source, netcdf_source, zarr_source, dataset): + source = {'netcdf': netcdf_source, 'zarr': zarr_source}[source] ds = source.to_dask() assert ds.dims == dataset.dims
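
For reviewers, a minimal usage sketch of the behavior this patch adds. It is
not part of the patch: the file paths and the 'member'/'year' dimension names
are hypothetical, and it assumes a checkout with this patch applied (plus
xarray >= 0.11.2 for the pattern case).

from intake_xarray.netcdf import NetCDFSource

# A list urlpath goes through xr.open_mfdataset; concat_dim names the
# dimension the files are concatenated along (new or pre-existing).
source = NetCDFSource(['data/air_1.nc', 'data/air_2.nc'],  # hypothetical paths
                      concat_dim='member')
ds = source.to_dask()
assert 'member' in ds.dims

# A format-pattern urlpath is globbed, and each pattern field ('year' here)
# comes back as a coordinate on the output; this is the branch guarded by
# the new version check.
source = NetCDFSource('data/air_{year:d}.nc', concat_dim='year')
print(source.to_dask().year.values)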