Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Backend grid improvements #19

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ fieldextra.diagnostic
dist/
*.egg-info
doc/_build
*/generated/*.rst
*/generated/*.rst
.vscode/*
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ iconarray.get_example_data()

#### grid - [modules/grid.py](modules/grid.py)

**`open_dataset()`** Reads model data from NetCDF or GRIB files into xarray.Datasets.

**`select_data()`** Get model variable data from NetCDF or GRIB files into a dictionary form that allows easy access to the data using the variable's shortName.

**`combine_grid_information()`** This adds the required grid information from a provided grid file to your dataset if not present. It also adds coordinates encoding to each variable, which is needed to plot using psyplot.

**`check_grid_information()`** Checks whether or not the grid data needs to be added. eg:
Expand All @@ -56,9 +60,13 @@ else:

**`wilks()`** Returns a value for which differences are significant when data point dependencies are accounted for (based on [Wilks 2016](https://journals.ametsoc.org/view/journals/bams/97/12/bams-d-15-00267.1.xml)).

#### data_explorer.py

**`show_data_vars()`** Returns a table with variables in your data. The first column shows the variable name psyplot will need to plot that variable.
This is useful if you plot GRIB data, because if `GRIB_cfVarName` is defined, cfgrib will set this as the variable name, as opposed to `GRIB_shortName` which you might expect.

**`inspect_data()`** Returns an inventory of useful information about the fields in the dataset.

#### interpolate.py - [modules/interpolate.py](modules/interpolate.py)

The functions in interpolate.py are used to facilitate the interpolation of ICON vector data to a regular grid, or a coarser ICON grid, for the purpose of vectorplots, eg wind plots. For psyplot we recommend to plot wind data on the regular grid as you can then scale the density of arrows in a vector plot as desired.
Expand Down
4 changes: 3 additions & 1 deletion doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ Core functions
latlonhash.Icon2latlon
utilities
utilities.ind_from_latlon
utilities.show_data_vars
data_explorer.inspect_data
data_explorer.show_data_vars
interpolate
interpolate.remap_ICON_to_regulargrid
interpolate.remap_ICON_to_ICON
Expand All @@ -53,6 +54,7 @@ Backend functions
grid.add_cell_data
grid.add_edge_data
grid.open_dataset
grid.select_data

Plotting functions
------------------
Expand Down
10 changes: 3 additions & 7 deletions iconarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@
combine_grid_information,
grid_consistency_check,
open_dataset,
select_data,
)
from .core.crop import Crop
from .core.data_explorer import inspect_data, show_data_vars
from .core.interpolate import remap_ICON_to_ICON, remap_ICON_to_regulargrid
from .core.utilities import (
add_coordinates,
get_stats,
ind_from_latlon,
show_data_vars,
wilks,
)
from .core.utilities import add_coordinates, get_stats, ind_from_latlon, wilks
from .plot.config import read_config
from .utils.get_data import get_example_data
189 changes: 158 additions & 31 deletions iconarray/backend/grid.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""The grid module contains functions relating to the grid information for ICON data, such as merging ICON data with the grid data to provide one merged dataset."""

import codecs
import logging
import pathlib
import sys

import cfgrib
import numpy as np
import psyplot.project as psy
import six
Expand Down Expand Up @@ -100,11 +104,23 @@ def combine_grid_information(file, grid_file):

"""
if isinstance(grid_file, pathlib.PurePath) or isinstance(grid_file, str):
grid = psy.open_dataset(grid_file)
try:
grid = psy.open_dataset(grid_file)
except ValueError:
logging.error(f"The grid file {grid_file} was not found.")
sys.exit()
else:
grid = grid_file
if isinstance(file, pathlib.PurePath) or isinstance(file, str):
ds = open_dataset(file).squeeze()
ds = open_dataset(file)
else:
ds = file.squeeze()
ds = file

try:
ds = ds.squeeze()
except AttributeError:
logging.error("Model data contains more than one hypercube.")
sys.exit()

datasetType = xr.core.dataset.Dataset
if type(ds) != datasetType or type(grid) != datasetType:
Expand All @@ -128,7 +144,8 @@ def combine_grid_information(file, grid_file):
ds = ds.rename(
{"time": ds.coords["time"].attrs["standard_name"], time_coord: "time"}
).expand_dims("time")
ds.time.attrs["axis"] = "T"
if hasattr(ds, "time"):
ds.time.attrs["axis"] = "T"

if "cell" in ds.dims:
ds = add_cell_data(ds, grid)
Expand Down Expand Up @@ -389,9 +406,85 @@ def add_edge_data(ds, grid):
return ds


def select_data(file_or_obj, variables, grid_file=None):
    """
    Select variables from data and store in dictionary for easy access to underlying data.

    Provides a dictionary mapping the variable names to the xarray.Datasets
    holding the variables. This is especially helpful if the input file contains
    data that cannot be represented as a single hypercube (see
    https://github.com/ecmwf/cfgrib for more info).

    Parameters
    ----------
    file_or_obj : file, ds or List(ds)
        data file
    variables : List(str)
        list of variable names to select, empty list for all variables
    grid_file : grid file or ds
        grid file. If variables from different hypercubes are selected, the grid
        might not fit all data and combine_grid_information() could fail

    Returns
    -------
    ds_dict : dictionary
        dictionary mapping variable names to xarray.Datasets

    """
    # Accept a single variable name as a convenience.
    if not isinstance(variables, list):
        variables = [variables]
    if not variables:
        logging.info(
            "Selecting all available variables can take a long time! "
            "Maybe choose a few by specifying them in the variables argument."
        )

    # Open the input unless it is already an xarray object (or list thereof).
    if isinstance(file_or_obj, (pathlib.PurePath, str)):
        ds = open_dataset(file_or_obj)
    else:
        ds = file_or_obj
    # open_dataset() may return a single Dataset; normalize to a list.
    if not isinstance(ds, list):
        ds = [ds]

    ds_dict = {}

    # Loop through hypercubes to find the requested data.
    for s_ds in ds:
        s_ds = s_ds.squeeze()
        if variables:
            # Keep only the requested variables present in this hypercube,
            # preserving the caller's requested order.
            sel_vars = [v for v in variables if v in s_ds.data_vars]
            if not sel_vars:
                continue
            s_ds = s_ds[sel_vars]
        # Attach the grid information if a grid was provided.
        if grid_file:
            try:
                s_ds = combine_grid_information(s_ds, grid_file)
            except WrongGridException as e:
                logging.error("The grid file does not fit all selected data.")
                sys.exit(e)
        # Add variables to the dictionary, keyed by variable name.
        for v in s_ds.keys():
            ds_dict[v] = s_ds[v]

    missing_vars = list(set(variables) - set(ds_dict.keys()))
    if missing_vars:
        logging.info(f"Did not find {missing_vars} in the data.")

    return ds_dict


def open_dataset(file):
    """
    Open either NETCDF or GRIB file.

    Returns either an xarray.Dataset, or a list of xarray.Datasets if the
    data in the GRIB file cannot be represented as a single hypercube (see
    https://github.com/ecmwf/cfgrib for more info).

    Parameters
    ----------
    file : Path or str
        Path to a NETCDF or GRIB data file.

    Returns
    ----------
    ds : xarray.Dataset or List(xarray.Datasets)

    Raises
    ----------
    TypeError
        If datatype is neither identified as GRIB or NETCDF.

    See Also
    ----------
    iconarray.backend

    """
    # Dispatch on the detected on-disk format rather than try/except opening,
    # so format problems surface as a clear TypeError.
    datatype = _identify_datatype(file)
    if datatype == "nc":
        return _open_NC(file)
    elif datatype == "grib":
        dss = _open_GRIB(file)
        # cfgrib returns one Dataset per hypercube; unwrap when there is
        # only a single one so simple files keep a simple return type.
        if len(dss) == 1:
            return dss[0]
        else:
            return dss
    else:
        raise TypeError("Data is neither GRIB nor NETCDF.")


def _identify_datatype(file):
    """Return "nc" for NETCDF input, "grib" for GRIB input, False otherwise."""
    probes = ((_identifyNC, "nc"), (_identifyGRIB, "grib"))
    for probe, label in probes:
        if probe(file):
            return label
    return False


def _identifyNC(file):
    """Return True if the file looks like NETCDF data, else False."""
    # Classic NETCDF files start with "CDF"; NETCDF-4/HDF5 files yield "HDF"
    # here once undecodable leading bytes are dropped by errors="ignore".
    with codecs.open(file, "r", encoding="utf-8", errors="ignore") as handle:
        magic = handle.read(3)
    return magic in ("CDF", "HDF")


def _identifyGRIB(file):
    """Return True if the file looks like GRIB data, else False."""
    # GRIB files begin with the 4-byte magic "GRIB"; compare case-insensitively
    # since the file is decoded leniently as text. Use a distinct handle name
    # so the `file` parameter is not shadowed.
    with codecs.open(file, "r", encoding="utf-8", errors="ignore") as handle:
        file_type = handle.read(4)
    return file_type.lower() == "grib"


def _open_NC(file):
    # Open a NETCDF file via psyplot so the returned Dataset is plot-ready.
    return psy.open_dataset(file)


def _open_GRIB(file):
    """Open a GRIB file with cfgrib, returning a list of xarray.Datasets (one per hypercube)."""
    # NOTE(review): empty "indexpath" and "errors": "ignore" are presumably
    # meant to skip index files and tolerate problematic messages — see cfgrib docs.
    backend_kwargs = {"indexpath": "", "errors": "ignore"}
    datasets = cfgrib.open_datasets(
        file,
        backend_kwargs=backend_kwargs,
        encode_cf=("time", "geography", "vertical"),
    )
    return datasets


def check_vertex2cell(ds_grid: xr.Dataset):
Expand Down
Loading