Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Backend grid improvements #19

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ fieldextra.diagnostic
dist/
*.egg-info
doc/_build
*/generated/*.rst
*/generated/*.rst
.vscode/*
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ iconarray.get_example_data()

#### grid - [modules/grid.py](modules/grid.py)

**`open_dataset()`** Reads model data from NetCDF or GRIB files into xarray.Datasets.

**`select_data()`** Get model variable data from NetCDF or GRIB files into a dictionary form that allows easy access to the data using the variable's shortName.

**`combine_grid_information()`** This adds the required grid information from a provided grid file to your dataset if not present. It also adds coordinates encoding to each variable, which is needed to plot using psyplot.

**`check_grid_information()`** Checks whether or not the grid data needs to be added. eg:
Expand All @@ -56,9 +60,13 @@ else:

**`wilks()`** Returns a value for which differences are significant when data point dependencies are accounted for (based on [Wilks 2016](https://journals.ametsoc.org/view/journals/bams/97/12/bams-d-15-00267.1.xml)).

#### data_explorer.py

**`show_data_vars()`** Returns a table with variables in your data. The first column shows the variable name psyplot will need to plot that variable.
This is useful if you plot GRIB data, because if `GRIB_cfVarName` is defined, cfgrib will set this as the variable name, as opposed to `GRIB_shortName` which you might expect.

**`inspect_data()`** Returns an inventory of useful information about the fields in the dataset.

#### interpolate.py - [modules/interpolate.py](modules/interpolate.py)

The functions in interpolate.py are used to facilitate the interpolation of ICON vector data to a regular grid, or a coarser ICON grid, for the purpose of vectorplots, eg wind plots. For psyplot we recommend to plot wind data on the regular grid as you can then scale the density of arrows in a vector plot as desired.
Expand Down
4 changes: 3 additions & 1 deletion doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ Core functions
latlonhash.Icon2latlon
utilities
utilities.ind_from_latlon
utilities.show_data_vars
data_explorer.inspect_data
data_explorer.show_data_vars
interpolate
interpolate.remap_ICON_to_regulargrid
interpolate.remap_ICON_to_ICON
Expand All @@ -53,6 +54,7 @@ Backend functions
grid.add_cell_data
grid.add_edge_data
grid.open_dataset
grid.select_data

Plotting functions
------------------
Expand Down
10 changes: 3 additions & 7 deletions iconarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@
combine_grid_information,
grid_consistency_check,
open_dataset,
select_data,
)
from .core.crop import Crop
from .core.data_explorer import inspect_data, show_data_vars
from .core.interpolate import remap_ICON_to_ICON, remap_ICON_to_regulargrid
from .core.utilities import (
add_coordinates,
get_stats,
ind_from_latlon,
show_data_vars,
wilks,
)
from .core.utilities import add_coordinates, get_stats, ind_from_latlon, wilks
from .plot.config import read_config
from .utils.get_data import get_example_data
189 changes: 158 additions & 31 deletions iconarray/backend/grid.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""The grid module contains functions relating to the grid information for ICON data, such as merging ICON data with the grid data to provide one merged dataset."""

import codecs
import logging
import pathlib
import sys

import cfgrib
import numpy as np
import psyplot.project as psy
import six
Expand Down Expand Up @@ -100,11 +104,23 @@ def combine_grid_information(file, grid_file):

"""
if isinstance(grid_file, pathlib.PurePath) or isinstance(grid_file, str):
grid = psy.open_dataset(grid_file)
try:
grid = psy.open_dataset(grid_file)
except ValueError:
logging.error(f"The grid file {grid_file} was not found.")
sys.exit()
else:
grid = grid_file
if isinstance(file, pathlib.PurePath) or isinstance(file, str):
ds = open_dataset(file).squeeze()
ds = open_dataset(file)
else:
ds = file.squeeze()
ds = file

try:
ds = ds.squeeze()
except AttributeError:
logging.error("Model data contains more than one hypercube.")
sys.exit()

datasetType = xr.core.dataset.Dataset
if type(ds) != datasetType or type(grid) != datasetType:
Expand All @@ -128,7 +144,8 @@ def combine_grid_information(file, grid_file):
ds = ds.rename(
{"time": ds.coords["time"].attrs["standard_name"], time_coord: "time"}
).expand_dims("time")
ds.time.attrs["axis"] = "T"
if hasattr(ds, "time"):
ds.time.attrs["axis"] = "T"

if "cell" in ds.dims:
ds = add_cell_data(ds, grid)
Expand Down Expand Up @@ -389,9 +406,85 @@ def add_edge_data(ds, grid):
return ds


def select_data(file_or_obj, variables, grid_file=None):
    """
    Select variables from data and store in dictionary for easy access to underlying data.

    Provides a dictionary mapping the variable names to the xarray.Datasets
    holding the variables. This is especially helpful if the input file contains
    data that cannot be represented as a single hypercube (see
    https://github.com/ecmwf/cfgrib for more info).

    Parameters
    ----------
    file_or_obj : file, ds or List(ds)
        data file
    variables : List(str)
        list of variable names to select, empty list for all variables
    grid_file : grid file or ds
        grid file. If variables from different hypercubes are selected, the grid
        might not fit all data and combine_grid_information() could fail

    Returns
    -------
    ds_dict : dictionary
        dictionary mapping variable names to xarray.Datasets

    """
    # Accept a single variable name as a convenience.
    if not isinstance(variables, list):
        variables = [variables]
    if not variables:
        logging.info(
            "Selecting all available variables can take a long time! "
            "Maybe choose a few by specifying them in the variables argument."
        )

    # Open the input unless it is already an xarray object (or list thereof).
    if isinstance(file_or_obj, (pathlib.PurePath, str)):
        ds = open_dataset(file_or_obj)
    else:
        ds = file_or_obj
    # open_dataset() may return a single Dataset; normalize to a list.
    if not isinstance(ds, list):
        ds = [ds]

    ds_dict = {}

    # Loop through hypercubes to find the requested data.
    for s_ds in ds:
        s_ds = s_ds.squeeze()
        if variables:
            # Keep only the requested variables present in this hypercube,
            # preserving the caller's requested order.
            sel_vars = [v for v in variables if v in s_ds.data_vars]
            if not sel_vars:
                continue
            s_ds = s_ds[sel_vars]
        # Attach the grid information if a grid was provided.
        if grid_file:
            try:
                s_ds = combine_grid_information(s_ds, grid_file)
            except WrongGridException as e:
                logging.error("The grid file does not fit all selected data.")
                sys.exit(e)
        # Add variables to the dictionary, keyed by variable name.
        for v in s_ds.keys():
            ds_dict[v] = s_ds[v]

    missing_vars = list(set(variables) - set(ds_dict.keys()))
    if missing_vars:
        logging.info(f"Did not find {missing_vars} in the data.")

    return ds_dict


def open_dataset(file):
    """
    Open either NETCDF or GRIB file.

    Returns either an xarray.Dataset, or a list of xarray.Datasets if the
    data in the GRIB file cannot be represented as a single hypercube (see
    https://github.com/ecmwf/cfgrib for more info).

    Parameters
    ----------
    file : Path or str
        Path to a NETCDF or GRIB data file.

    Returns
    ----------
    ds : xarray.Dataset or List(xarray.Datasets)

    Raises
    ----------
    TypeError
        If datatype is neither identified as GRIB or NETCDF.

    See Also
    ----------
    iconarray.backend

    """
    # Dispatch on the detected on-disk format rather than try/except opening,
    # so format problems surface as a clear TypeError.
    datatype = _identify_datatype(file)
    if datatype == "nc":
        return _open_NC(file)
    elif datatype == "grib":
        dss = _open_GRIB(file)
        # cfgrib returns one Dataset per hypercube; unwrap when there is
        # only a single one so simple files keep a simple return type.
        if len(dss) == 1:
            return dss[0]
        else:
            return dss
    else:
        raise TypeError("Data is neither GRIB nor NETCDF.")


def _identify_datatype(file):
    """Return "nc" for NETCDF input, "grib" for GRIB input, False otherwise."""
    probes = ((_identifyNC, "nc"), (_identifyGRIB, "grib"))
    for probe, label in probes:
        if probe(file):
            return label
    return False


def _identifyNC(file):
    """Return True if the file looks like NETCDF data, else False."""
    # Classic NETCDF files start with "CDF"; NETCDF-4/HDF5 files yield "HDF"
    # here once undecodable leading bytes are dropped by errors="ignore".
    with codecs.open(file, "r", encoding="utf-8", errors="ignore") as handle:
        magic = handle.read(3)
    return magic in ("CDF", "HDF")


def _identifyGRIB(file):
    """Return True if the file looks like GRIB data, else False."""
    # GRIB files begin with the 4-byte magic "GRIB"; compare case-insensitively
    # since the file is decoded leniently as text. Use a distinct handle name
    # so the `file` parameter is not shadowed.
    with codecs.open(file, "r", encoding="utf-8", errors="ignore") as handle:
        file_type = handle.read(4)
    return file_type.lower() == "grib"


def _open_NC(file):
    # Open a NETCDF file via psyplot so the returned Dataset is plot-ready.
    return psy.open_dataset(file)


def _open_GRIB(file):
    """Open a GRIB file with cfgrib, returning a list of xarray.Datasets (one per hypercube)."""
    # NOTE(review): empty "indexpath" and "errors": "ignore" are presumably
    # meant to skip index files and tolerate problematic messages — see cfgrib docs.
    backend_kwargs = {"indexpath": "", "errors": "ignore"}
    datasets = cfgrib.open_datasets(
        file,
        backend_kwargs=backend_kwargs,
        encode_cf=("time", "geography", "vertical"),
    )
    return datasets


def check_vertex2cell(ds_grid: xr.Dataset):
Expand Down
Loading