From 59c33474203141816ba91f326aa54e1bd3c9ac3c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Jun 2024 10:49:04 -0600 Subject: [PATCH 1/2] Add geometry encoding and decoding functions. These differ from `shapely_to_cf` and `cf_to_shapely` by returning all variables. Those function, simply encode and decode geometry-related variables. --- cf_xarray/__init__.py | 2 + cf_xarray/geometry.py | 201 +++++++++++++++++++++++++++++-- cf_xarray/tests/test_geometry.py | 43 +++++-- doc/api.rst | 10 ++ doc/coding.md | 4 + doc/geometry.md | 34 +++++- 6 files changed, 266 insertions(+), 28 deletions(-) diff --git a/cf_xarray/__init__.py b/cf_xarray/__init__.py index 782317e3..6ae0a916 100644 --- a/cf_xarray/__init__.py +++ b/cf_xarray/__init__.py @@ -9,4 +9,6 @@ from .options import set_options # noqa from .utils import _get_version +from . import geometry # noqa + __version__ = _get_version() diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 16b612e4..1e36e941 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -1,11 +1,188 @@ from __future__ import annotations +import copy from collections.abc import Sequence import numpy as np import pandas as pd import xarray as xr +GEOMETRY_CONTAINER_NAME = "geometry_container" + +__all__ = [ + "decode_geometries", + "encode_geometries", + "cf_to_shapely", + "shapely_to_cf", +] + + +def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: + """ + Decode CF encoded geometries to a numpy object array containing shapely geometries. + + Parameters + ---------- + encoded : Dataset + A Xarray Dataset containing encoded geometries. + + Returns + ------- + Dataset + A Xarray Dataset containing decoded geometries. + + See Also + -------- + shapely_to_cf + cf_to_shapely + encode_geometries + """ + if GEOMETRY_CONTAINER_NAME not in encoded._variables: + raise NotImplementedError( + f"Currently only a single geometry variable named {GEOMETRY_CONTAINER_NAME!r} is supported." + "A variable by this name is not present in the provided dataset." + ) + + enc_geom_var = encoded[GEOMETRY_CONTAINER_NAME] + geom_attrs = enc_geom_var.attrs + # Grab the coordinates attribute + geom_attrs.update(enc_geom_var.encoding) + + geom_var = cf_to_shapely(encoded).variable + + todrop = (GEOMETRY_CONTAINER_NAME,) + tuple( + s + for s in " ".join( + geom_attrs.get(attr, "") + for attr in [ + "interior_ring", + "node_coordinates", + "node_count", + "part_node_count", + "coordinates", + ] + ).split(" ") + if s + ) + decoded = encoded.drop_vars(todrop) + + name = geom_attrs.get("variable_name", None) + if name in decoded.dims: + decoded = decoded.assign_coords( + xr.Coordinates(coords={name: geom_var}, indexes={}) + ) + else: + decoded[name] = geom_var + + # Is this a good idea? We are deleting information. + for var in decoded._variables.values(): + if var.attrs.get("geometry") == GEOMETRY_CONTAINER_NAME: + var.attrs.pop("geometry") + return decoded + + +def encode_geometries(ds: xr.Dataset): + """ + Encode any discovered geometry variables using the CF conventions. + + Practically speaking, geometry variables are numpy object arrays where the first + element is a shapely geometry. + + .. warning:: + + Only a single geometry variable is supported at present. Contributions to fix this + are welcome. + + Parameters + ---------- + ds : Dataset + Dataset containing at least one geometry variable. + + Returns + ------- + Dataset + Where all geometry variables are encoded. + + See Also + -------- + shapely_to_cf + cf_to_shapely + """ + from shapely import ( + LineString, + MultiLineString, + MultiPoint, + MultiPolygon, + Point, + Polygon, + ) + + SHAPELY_TYPES = ( + Point, + LineString, + Polygon, + MultiPoint, + MultiLineString, + MultiPolygon, + ) + + geom_var_names = [ + name + for name, var in ds._variables.items() + if var.dtype == "O" and isinstance(var.data.flat[0], SHAPELY_TYPES) + ] + if not geom_var_names: + return ds + + if to_drop := set(geom_var_names) & set(ds._indexes): + # e.g. xvec GeometryIndex + ds = ds.drop_indexes(to_drop) + + if len(geom_var_names) > 1: + raise NotImplementedError( + "Multiple geometry variables are not supported at this time. " + "Contributions to fix this are welcome. " + f"Detected geometry variables are {geom_var_names!r}" + ) + + (name,) = geom_var_names + variables = {} + # If `name` is a dimension name, then we need to drop it. Otherwise we don't + # So set errors="ignore" + variables.update( + shapely_to_cf(ds[name]).drop_vars(name, errors="ignore")._variables + ) + + geom_var = ds[name] + + more_updates = {} + for varname, var in ds._variables.items(): + if varname == name: + continue + if name in var.dims: + var = var.copy() + var._attrs = copy.deepcopy(var._attrs) + var.attrs["geometry"] = GEOMETRY_CONTAINER_NAME + # The grid_mapping and coordinates attributes can be carried by the geometry container + # variable provided they are also carried by the data variables associated with the container. + if to_add := geom_var.attrs.get("coordinates", ""): + var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add + more_updates[varname] = var + variables.update(more_updates) + + # WARNING: cf-xarray specific convention. + # For vector data cubes, `name` is a dimension name. + # By encoding to CF, we have + # encoded the information in that variable across many different + # variables (e.g. node_count) with `name` as a dimension. + # We have to record `name` somewhere so that we reconstruct + # a geometry variable of the right name at decode-time. + variables[GEOMETRY_CONTAINER_NAME].attrs["variable_name"] = name + + encoded = xr.Dataset(variables) + + return encoded + def reshape_unique_geometries( ds: xr.Dataset, @@ -119,13 +296,15 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" ) + ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="crd_x crd_y") + # Special treatment of selected grid mappings if grid_mapping == "longitude_latitude": # Special case for longitude_latitude grid mapping ds = ds.rename(crd_x="lon", crd_y="lat") ds.lon.attrs.update(units="degrees_east", standard_name="longitude") ds.lat.attrs.update(units="degrees_north", standard_name="latitude") - ds.geometry_container.attrs.update(coordinates="lon lat") + ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="lon lat") ds.x.attrs.update(units="degrees_east", standard_name="longitude") ds.y.attrs.update(units="degrees_north", standard_name="latitude") elif grid_mapping is not None: @@ -157,7 +336,7 @@ def cf_to_shapely(ds: xr.Dataset): ---------- Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries """ - geom_type = ds.geometry_container.attrs["geometry_type"] + geom_type = ds[GEOMETRY_CONTAINER_NAME].attrs["geometry_type"] if geom_type == "point": geometries = cf_to_points(ds) elif geom_type == "line": @@ -235,7 +414,7 @@ def points_to_cf(pts: xr.DataArray | Sequence): # Special case when we have no MultiPoints if (ds.node_count == 1).all(): ds = ds.drop_vars("node_count") - del ds.geometry_container.attrs["node_count"] + del ds[GEOMETRY_CONTAINER_NAME].attrs["node_count"] return ds @@ -259,7 +438,7 @@ def cf_to_points(ds: xr.Dataset): from shapely.geometry import MultiPoint, Point # Shorthand for convenience - geo = ds.geometry_container.attrs + geo = ds[GEOMETRY_CONTAINER_NAME].attrs # The features dimension name, defaults to the one of 'node_count' or the dimension of the coordinates, if present. feat_dim = None @@ -267,10 +446,10 @@ def cf_to_points(ds: xr.Dataset): xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims - x_name, y_name = ds.geometry_container.attrs["node_coordinates"].split(" ") + x_name, y_name = ds[GEOMETRY_CONTAINER_NAME].attrs["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) - node_count_name = ds.geometry_container.attrs.get("node_count") + node_count_name = ds[GEOMETRY_CONTAINER_NAME].attrs.get("node_count") if node_count_name is None: # No node_count means all geometries are single points (node_count = 1) # And if we had no coordinates, then the dimension defaults to "features" @@ -363,7 +542,7 @@ def lines_to_cf(lines: xr.DataArray | Sequence): # Special case when we have no MultiLines if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") - del ds.geometry_container.attrs["part_node_count"] + del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] return ds @@ -387,7 +566,7 @@ def cf_to_lines(ds: xr.Dataset): from shapely import GeometryType, from_ragged_array # Shorthand for convenience - geo = ds.geometry_container.attrs + geo = ds[GEOMETRY_CONTAINER_NAME].attrs # The features dimension name, defaults to the one of 'node_count' # or the dimension of the coordinates, if present. @@ -503,12 +682,12 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): # Special case when we have no MultiPolygons and no holes if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") - del ds.geometry_container.attrs["part_node_count"] + del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] # Special case when we have no holes if (ds.interior_ring == 0).all(): ds = ds.drop_vars("interior_ring") - del ds.geometry_container.attrs["interior_ring"] + del ds[GEOMETRY_CONTAINER_NAME].attrs["interior_ring"] return ds @@ -532,7 +711,7 @@ def cf_to_polygons(ds: xr.Dataset): from shapely import GeometryType, from_ragged_array # Shorthand for convenience - geo = ds.geometry_container.attrs + geo = ds[GEOMETRY_CONTAINER_NAME].attrs # The features dimension name, defaults to the one of 'part_node_count' # or the dimension of the coordinates, if present. diff --git a/cf_xarray/tests/test_geometry.py b/cf_xarray/tests/test_geometry.py index 458355e9..374bb21c 100644 --- a/cf_xarray/tests/test_geometry.py +++ b/cf_xarray/tests/test_geometry.py @@ -4,9 +4,23 @@ import cf_xarray as cfxr +from ..geometry import decode_geometries, encode_geometries from . import requires_shapely +@pytest.fixture +def polygon_geometry() -> xr.DataArray: + from shapely.geometry import Polygon + + # empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries. + geoms = np.empty(2, dtype=object) + geoms[:] = [ + Polygon(([50, 0], [40, 15], [30, 0])), + Polygon(([70, 50], [60, 65], [50, 50])), + ] + return xr.DataArray(geoms, dims=("index",), name="geometry") + + @pytest.fixture def geometry_ds(): from shapely.geometry import MultiPoint, Point @@ -127,18 +141,9 @@ def geometry_line_without_multilines_ds(): @pytest.fixture -def geometry_polygon_without_holes_ds(): - from shapely.geometry import Polygon - - # empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries. - geoms = np.empty(2, dtype=object) - geoms[:] = [ - Polygon(([50, 0], [40, 15], [30, 0])), - Polygon(([70, 50], [60, 65], [50, 50])), - ] - +def geometry_polygon_without_holes_ds(polygon_geometry): + shp_da = polygon_geometry ds = xr.Dataset() - shp_da = xr.DataArray(geoms, dims=("index",), name="geometry") cf_ds = ds.assign( x=xr.DataArray( @@ -521,3 +526,19 @@ def test_reshape_unique_geometries(geometry_ds): in_ds = in_ds.assign(geometry=geoms) with pytest.raises(ValueError, match="The geometry variable must be 1D"): cfxr.geometry.reshape_unique_geometries(in_ds) + + +@requires_shapely +def test_encode_decode(geometry_ds, polygon_geometry): + + geom_dim_ds = xr.Dataset() + geom_dim_ds = geom_dim_ds.assign_coords( + xr.Coordinates( + coords={"geoms": xr.Variable("geoms", polygon_geometry.variable)}, + indexes={}, + ) + ).assign({"foo": ("geoms", [1, 2])}) + + for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds): + roundtripped = decode_geometries(encode_geometries(ds)) + xr.testing.assert_identical(ds, roundtripped) diff --git a/doc/api.rst b/doc/api.rst index 8a9060dc..1d4b659c 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -17,6 +17,16 @@ Top-level API encode_multi_index_as_compress decode_compress_to_multi_index +Geometries +---------- +.. autosummary:: + :toctree: generated/ + + geometry.decode_geometries + geometry.encode_geometries + geometry.shapely_to_cf + geometry.cf_to_shapely + .. currentmodule:: xarray DataArray diff --git a/doc/coding.md b/doc/coding.md index 2ea05a69..6183781d 100644 --- a/doc/coding.md +++ b/doc/coding.md @@ -26,6 +26,10 @@ xr.set_options(display_expand_data=False) `cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray. +## Geometries + +See [](./geometry.md) for more. + ## Compression by gathering The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering) diff --git a/doc/geometry.md b/doc/geometry.md index 9ea702eb..ac840139 100644 --- a/doc/geometry.md +++ b/doc/geometry.md @@ -8,15 +8,13 @@ kernelspec: --- ```{eval-rst} -.. currentmodule:: xarray +.. currentmodule:: cf_xarray ``` # Geometries ```{seealso} -1. [The CF conventions on Geometries](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#geometries) -1. {py:func}`cf_xarray.shapely_to_cf` -1. {py:func}`cf_xarray.cf_to_shapely` +[The CF conventions on Geometries](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#geometries) ``` `cf_xarray` can convert between vector geometries represented as shapely objects @@ -42,6 +40,7 @@ da = xr.DataArray( dims=("index",), name="geometry" ) +da ``` ```{warning} @@ -50,7 +49,30 @@ da = xr.DataArray( geometries of the same type. ``` -Now we can take that `xr.DataArray` containing shapely geometries and convert it to cf: +## Encode / Decode + +`cf-xarray` provides {py:func}`geometry.encode_geometries` and {py:func}`geometry.decode_geometries` to +encode and decode xarray Datasets to/from a CF-compliant form that can be written to any array storage format. + +For example: + +```{code-cell} +ds = da.to_dataset() +encoded = cfxr.geometry.encode_geometries(ds) +encoded +``` + +This dataset can then be written to any format supported by Xarray. +To decode, reverse the process using {py:func}`geometry.decode_geometries` + +```{code-cell} +decoded = cfxr.geometry.decode_geometries(encoded) +ds.identical(decoded) +``` + +## Lower-level conversions + +Encoding a single DataArray is possible using {py:func}`geometry.shapely_to_cf`. ```{code-cell} ds_cf = cfxr.shapely_to_cf(da) @@ -94,6 +116,6 @@ By default these are called `'crd_x'` and `'crd_y'` unless `grid_mapping` is spe For MultiPolygons with holes the CF notation is slightly ambiguous on which hole is associated with which polygon. This is problematic because shapely stores holes within the polygon -object that they are associated with. `cf_xarray` assumes that the the shapes are interleaved +object that they are associated with. `cf_xarray` assumes that the shapes are interleaved such that the holes (interior rings) are associated with the exteriors (exterior rings) that immediately precede them. From cb7f5d0227d9071df7d9210b0d5b424cf2adc784 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 24 Jun 2024 10:57:07 -0600 Subject: [PATCH 2/2] More geometry functionality 1. Associate geometry vars as coordinates when we can 2. Add `cf.geometries` 3. Geometries in repr 4. Allow indexing by `"geometry"` or any geometry type. --- cf_xarray/accessor.py | 140 ++++++++++++++++++++++++------- cf_xarray/criteria.py | 9 ++ cf_xarray/datasets.py | 29 +++++++ cf_xarray/formatting.py | 11 +++ cf_xarray/geometry.py | 4 +- cf_xarray/tests/conftest.py | 55 ++++++++++++ cf_xarray/tests/test_accessor.py | 22 +++++ cf_xarray/tests/test_geometry.py | 45 +--------- doc/api.rst | 1 + doc/geometry.md | 82 ++++++++++++------ 10 files changed, 301 insertions(+), 97 deletions(-) create mode 100644 cf_xarray/tests/conftest.py diff --git a/cf_xarray/accessor.py b/cf_xarray/accessor.py index 3f5f2976..a915335b 100644 --- a/cf_xarray/accessor.py +++ b/cf_xarray/accessor.py @@ -28,8 +28,10 @@ from . import sgrid from .criteria import ( _DSG_ROLES, + _GEOMETRY_TYPES, cf_role_criteria, coordinate_criteria, + geometry_var_criteria, grid_mapping_var_criteria, regex, ) @@ -39,6 +41,7 @@ _format_data_vars, _format_dsg_roles, _format_flags, + _format_geometries, _format_sgrid, _maybe_panel, ) @@ -198,7 +201,9 @@ def _get_groupby_time_accessor( def _get_custom_criteria( - obj: DataArray | Dataset, key: Hashable, criteria: Mapping | None = None + obj: DataArray | Dataset, + key: Hashable, + criteria: Iterable[Mapping] | Mapping | None = None, ) -> list[Hashable]: """ Translate from axis, coord, or custom name to variable name. @@ -227,18 +232,16 @@ def _get_custom_criteria( except ImportError: from re import match as regex_match # type: ignore[no-redef] - if isinstance(obj, DataArray): - obj = obj._to_temp_dataset() - variables = obj._variables - if criteria is None: if not OPTIONS["custom_criteria"]: return [] criteria = OPTIONS["custom_criteria"] - if criteria is not None: - criteria_iter = always_iterable(criteria, allowed=(tuple, list, set)) + if isinstance(obj, DataArray): + obj = obj._to_temp_dataset() + variables = obj._variables + criteria_iter = always_iterable(criteria, allowed=(tuple, list, set)) criteria_map = ChainMap(*criteria_iter) results: set = set() if key in criteria_map: @@ -367,6 +370,21 @@ def _get_measure(obj: DataArray | Dataset, key: str) -> list[str]: return list(results) +def _parse_related_geometry_vars(attrs: Mapping) -> tuple[Hashable]: + names = itertools.chain( + *[ + attrs.get(attr, "").split(" ") + for attr in [ + "interior_ring", + "node_coordinates", + "node_count", + "part_node_count", + ] + ] + ) + return tuple(n for n in names if n) + + def _get_bounds(obj: DataArray | Dataset, key: Hashable) -> list[Hashable]: """ Translate from key (either CF key or variable name) to its bounds' variable names. @@ -470,8 +488,14 @@ def _get_all(obj: DataArray | Dataset, key: Hashable) -> list[Hashable]: """ all_mappers: tuple[Mapper] = ( _get_custom_criteria, - functools.partial(_get_custom_criteria, criteria=cf_role_criteria), # type: ignore[assignment] - functools.partial(_get_custom_criteria, criteria=grid_mapping_var_criteria), + functools.partial( + _get_custom_criteria, + criteria=( + cf_role_criteria, + grid_mapping_var_criteria, + geometry_var_criteria, + ), + ), # type: ignore[assignment] _get_axis_coord, _get_measure, _get_grid_mapping_name, @@ -821,6 +845,23 @@ def check_results(names, key): successful[k] = bool(grid_mapping) if grid_mapping: varnames.extend(grid_mapping) + elif "geometries" not in skip and (k == "geometry" or k in _GEOMETRY_TYPES): + geometries = _get_all(obj, k) + if geometries and k in _GEOMETRY_TYPES: + new = itertools.chain( + _parse_related_geometry_vars( + ChainMap(obj[g].attrs, obj[g].encoding) + ) + for g in geometries + ) + geometries.extend(*new) + if len(geometries) > 1 and scalar_key: + raise ValueError( + f"CF geometries must be represented by an Xarray Dataset. To request a Dataset in return please pass `[{k!r}]` instead." + ) + successful[k] = bool(geometries) + if geometries: + varnames.extend(geometries) elif k in custom_criteria or k in cf_role_criteria: names = _get_all(obj, k) check_results(names, k) @@ -1559,8 +1600,7 @@ def _generate_repr(self, rich=False): _format_flags(self, rich), title="Flag Variable", rich=rich ) - roles = self.cf_roles - if roles: + if roles := self.cf_roles: if any(role in roles for role in _DSG_ROLES): yield _maybe_panel( _format_dsg_roles(self, dims, rich), @@ -1576,6 +1616,13 @@ def _generate_repr(self, rich=False): rich=rich, ) + if self.geometries: + yield _maybe_panel( + _format_geometries(self, dims, rich), + title="Geometries", + rich=rich, + ) + yield _maybe_panel( _format_coordinates(self, dims, coords, rich), title="Coordinates", @@ -1755,12 +1802,42 @@ def cf_roles(self) -> dict[str, list[Hashable]]: vardict: dict[str, list[Hashable]] = {} for k, v in variables.items(): - if "cf_role" in v.attrs: - role = v.attrs["cf_role"] + attrs_or_encoding = ChainMap(v.attrs, v.encoding) + if role := attrs_or_encoding.get("cf_role", None): vardict[role] = vardict.setdefault(role, []) + [k] return {role_: sort_maybe_hashable(v) for role_, v in vardict.items()} + @property + def geometries(self) -> dict[str, list[Hashable]]: + """ + Mapping geometry type names to variable names. + + Returns + ------- + dict + Dictionary mapping geometry names to variable names. + + References + ---------- + Please refer to the CF conventions document : http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#coordinates-metadata + """ + vardict: dict[str, list[Hashable]] = {} + + if isinstance(self._obj, Dataset): + variables = self._obj._variables + elif isinstance(self._obj, DataArray): + variables = {"_": self._obj._variable} + + for v in variables.values(): + attrs_or_encoding = ChainMap(v.attrs, v.encoding) + if geometry := attrs_or_encoding.get("geometry", None): + gtype = self._obj[geometry].attrs["geometry_type"] + vardict.setdefault(gtype, []) + if geometry not in vardict[gtype]: + vardict[gtype] += [geometry] + return {type_: sort_maybe_hashable(v) for type_, v in vardict.items()} + def get_associated_variable_names( self, name: Hashable, skip_bounds: bool = False, error: bool = True ) -> dict[str, list[Hashable]]: @@ -1795,15 +1872,15 @@ def get_associated_variable_names( "bounds", "grid_mapping", "grid", + "geometry", ] coords: dict[str, list[Hashable]] = {k: [] for k in keys} attrs_or_encoding = ChainMap(self._obj[name].attrs, self._obj[name].encoding) - coordinates = attrs_or_encoding.get("coordinates", None) # Handles case where the coordinates attribute is None # This is used to tell xarray to not write a coordinates attribute - if coordinates: + if coordinates := attrs_or_encoding.get("coordinates", None): coords["coordinates"] = coordinates.split(" ") if "cell_measures" in attrs_or_encoding: @@ -1822,27 +1899,32 @@ def get_associated_variable_names( ) coords["cell_measures"] = [] - if ( - isinstance(self._obj, Dataset) - and "ancillary_variables" in attrs_or_encoding + if isinstance(self._obj, Dataset) and ( + anc := attrs_or_encoding.get("ancillary_variables", None) ): - coords["ancillary_variables"] = attrs_or_encoding[ - "ancillary_variables" - ].split(" ") + coords["ancillary_variables"] = anc.split(" ") if not skip_bounds: - if "bounds" in attrs_or_encoding: - coords["bounds"] = [attrs_or_encoding["bounds"]] + if bounds := attrs_or_encoding.get("bounds", None): + coords["bounds"] = [bounds] for dim in self._obj[name].dims: - dbounds = self._obj[dim].attrs.get("bounds", None) - if dbounds: + if dbounds := self._obj[dim].attrs.get("bounds", None): coords["bounds"].append(dbounds) - if "grid" in attrs_or_encoding: - coords["grid"] = [attrs_or_encoding["grid"]] + for attrname in ["grid", "grid_mapping"]: + if maybe := attrs_or_encoding.get(attrname, None): + coords[attrname] = [maybe] - if "grid_mapping" in attrs_or_encoding: - coords["grid_mapping"] = [attrs_or_encoding["grid_mapping"]] + more: Sequence[Hashable] = () + if geometry_var := attrs_or_encoding.get("geometry", None): + coords["geometry"] = [geometry_var] + _attrs = ChainMap( + self._obj[geometry_var].attrs, self._obj[geometry_var].encoding + ) + more = _parse_related_geometry_vars(_attrs) + elif "geometry_type" in attrs_or_encoding: + more = _parse_related_geometry_vars(attrs_or_encoding) + coords["geometry"].extend(more) allvars = itertools.chain(*coords.values()) missing = set(allvars) - set(self._maybe_to_dataset()._variables) diff --git a/cf_xarray/criteria.py b/cf_xarray/criteria.py index 76299520..81b78ab8 100644 --- a/cf_xarray/criteria.py +++ b/cf_xarray/criteria.py @@ -12,7 +12,10 @@ from collections.abc import Mapping, MutableMapping from typing import Any +#: CF Roles understood by cf-xarray _DSG_ROLES = ["timeseries_id", "profile_id", "trajectory_id"] +#: Geometry types understood by cf-xarray +_GEOMETRY_TYPES = ("line", "point", "polygon") cf_role_criteria: Mapping[str, Mapping[str, str]] = { k: {"cf_role": k} @@ -31,6 +34,12 @@ "grid_mapping": {"grid_mapping_name": re.compile(".")} } +# A geometry container is anything with a geometry_type attribute +geometry_var_criteria: Mapping[str, Mapping[str, Any]] = { + "geometry": {"geometry_type": re.compile(".")}, + **{k: {"geometry_type": k} for k in _GEOMETRY_TYPES}, +} + coordinate_criteria: MutableMapping[str, MutableMapping[str, tuple]] = { "latitude": { "standard_name": ("latitude",), diff --git a/cf_xarray/datasets.py b/cf_xarray/datasets.py index d1125706..997ba987 100644 --- a/cf_xarray/datasets.py +++ b/cf_xarray/datasets.py @@ -748,3 +748,32 @@ def _create_inexact_bounds(): node_coordinates="node_lon node_lat node_elevation", ), ) + + +def point_dataset(): + from shapely.geometry import MultiPoint, Point + + da = xr.DataArray( + [ + MultiPoint([(1.0, 2.0), (2.0, 3.0)]), + Point(3.0, 4.0), + Point(4.0, 5.0), + Point(3.0, 4.0), + ], + dims=("index",), + name="geometry", + ) + ds = da.to_dataset() + return ds + + +def encoded_point_dataset(): + from .geometry import encode_geometries + + ds = encode_geometries(point_dataset()) + ds["data"] = ( + "index", + np.arange(ds.sizes["index"]), + {"geometry": "geometry_container"}, + ) + return ds diff --git a/cf_xarray/formatting.py b/cf_xarray/formatting.py index f99e95bc..4fe58ccb 100644 --- a/cf_xarray/formatting.py +++ b/cf_xarray/formatting.py @@ -295,6 +295,17 @@ def _format_dsg_roles(accessor, dims, rich): ) +def _format_geometries(accessor, dims, rich): + yield make_text_section( + accessor, + "CF Geometries", + "geometries", + dims=dims, + # valid_keys=_DSG_ROLES, + rich=rich, + ) + + def _format_coordinates(accessor, dims, coords, rich): from .accessor import _AXIS_NAMES, _CELL_MEASURES, _COORD_NAMES diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 1e36e941..760436f7 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -159,6 +159,8 @@ def encode_geometries(ds: xr.Dataset): for varname, var in ds._variables.items(): if varname == name: continue + # TODO: this is incomplete. It works for vector data cubes where one of the geometry vars + # is a dimension coordinate. if name in var.dims: var = var.copy() var._attrs = copy.deepcopy(var._attrs) @@ -244,7 +246,7 @@ def reshape_unique_geometries( out[geom_var] = ds[geom_var].isel({old_name: unique_indexes}) if old_name not in ds.coords: # If there was no coord before, drop the dummy one we made. - out = out.drop_vars(old_name) + out = out.drop_vars(old_name) # type: ignore[arg-type,unused-ignore] # Hashable/str stuff return out diff --git a/cf_xarray/tests/conftest.py b/cf_xarray/tests/conftest.py new file mode 100644 index 00000000..7518817e --- /dev/null +++ b/cf_xarray/tests/conftest.py @@ -0,0 +1,55 @@ +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture(scope="session") +def geometry_ds(): + pytest.importorskip("shapely") + + from shapely.geometry import MultiPoint, Point + + # empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries. + geoms = np.empty(4, dtype=object) + geoms[:] = [ + MultiPoint([(1.0, 2.0), (2.0, 3.0)]), + Point(3.0, 4.0), + Point(4.0, 5.0), + Point(3.0, 4.0), + ] + + ds = xr.Dataset( + { + "data": xr.DataArray( + range(len(geoms)), + dims=("index",), + attrs={ + "coordinates": "crd_x crd_y", + }, + ), + "time": xr.DataArray([0, 0, 0, 1], dims=("index",)), + } + ) + shp_ds = ds.assign(geometry=xr.DataArray(geoms, dims=("index",))) + # Here, since it should not be present in shp_ds + ds.data.attrs["geometry"] = "geometry_container" + + cf_ds = ds.assign( + x=xr.DataArray([1.0, 2.0, 3.0, 4.0, 3.0], dims=("node",), attrs={"axis": "X"}), + y=xr.DataArray([2.0, 3.0, 4.0, 5.0, 4.0], dims=("node",), attrs={"axis": "Y"}), + node_count=xr.DataArray([2, 1, 1, 1], dims=("index",)), + crd_x=xr.DataArray([1.0, 3.0, 4.0, 3.0], dims=("index",), attrs={"nodes": "x"}), + crd_y=xr.DataArray([2.0, 4.0, 5.0, 4.0], dims=("index",), attrs={"nodes": "y"}), + geometry_container=xr.DataArray( + attrs={ + "geometry_type": "point", + "node_count": "node_count", + "node_coordinates": "x y", + "coordinates": "crd_x crd_y", + } + ), + ) + + cf_ds = cf_ds.set_coords(["x", "y", "crd_x", "crd_y"]) + + return cf_ds, shp_ds diff --git a/cf_xarray/tests/test_accessor.py b/cf_xarray/tests/test_accessor.py index 147e9686..5cc58688 100644 --- a/cf_xarray/tests/test_accessor.py +++ b/cf_xarray/tests/test_accessor.py @@ -2076,3 +2076,25 @@ def test_ancillary_variables_extra_dim(): } ) assert_identical(ds.cf["X"], ds["x"]) + + +def test_geometry_association(geometry_ds): + cf_ds, _ = geometry_ds + actual = cf_ds.cf[["data"]] + for name in ["geometry_container", "x", "y", "node_count", "crd_x", "crd_y"]: + assert name in actual.coords + + actual = cf_ds.cf["data"] + for name in ["geometry_container", "node_count", "crd_x", "crd_y"]: + assert name in actual.coords + + assert cf_ds.cf.geometries == {"point": ["geometry_container"]} + assert_identical(cf_ds.cf["geometry"], cf_ds["geometry_container"]) + with pytest.raises(ValueError): + cf_ds.cf["point"] + + expected = cf_ds[["geometry_container", "node_count", "x", "y", "crd_x", "crd_y"]] + assert_identical( + cf_ds.cf[["point"]], + expected.set_coords(["node_count", "x", "y", "crd_x", "crd_y"]), + ) diff --git a/cf_xarray/tests/test_geometry.py b/cf_xarray/tests/test_geometry.py index 374bb21c..b026f23e 100644 --- a/cf_xarray/tests/test_geometry.py +++ b/cf_xarray/tests/test_geometry.py @@ -21,48 +21,6 @@ def polygon_geometry() -> xr.DataArray: return xr.DataArray(geoms, dims=("index",), name="geometry") -@pytest.fixture -def geometry_ds(): - from shapely.geometry import MultiPoint, Point - - # empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries. - geoms = np.empty(4, dtype=object) - geoms[:] = [ - MultiPoint([(1.0, 2.0), (2.0, 3.0)]), - Point(3.0, 4.0), - Point(4.0, 5.0), - Point(3.0, 4.0), - ] - - ds = xr.Dataset( - { - "data": xr.DataArray(range(len(geoms)), dims=("index",)), - "time": xr.DataArray([0, 0, 0, 1], dims=("index",)), - } - ) - shp_ds = ds.assign(geometry=xr.DataArray(geoms, dims=("index",))) - - cf_ds = ds.assign( - x=xr.DataArray([1.0, 2.0, 3.0, 4.0, 3.0], dims=("node",), attrs={"axis": "X"}), - y=xr.DataArray([2.0, 3.0, 4.0, 5.0, 4.0], dims=("node",), attrs={"axis": "Y"}), - node_count=xr.DataArray([2, 1, 1, 1], dims=("index",)), - crd_x=xr.DataArray([1.0, 3.0, 4.0, 3.0], dims=("index",), attrs={"nodes": "x"}), - crd_y=xr.DataArray([2.0, 4.0, 5.0, 4.0], dims=("index",), attrs={"nodes": "y"}), - geometry_container=xr.DataArray( - attrs={ - "geometry_type": "point", - "node_count": "node_count", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } - ), - ) - - cf_ds = cf_ds.set_coords(["x", "y", "crd_x", "crd_y"]) - - return cf_ds, shp_ds - - @pytest.fixture def geometry_line_ds(): from shapely.geometry import LineString, MultiLineString @@ -284,8 +242,11 @@ def test_shapely_to_cf(geometry_ds): from shapely.geometry import Point expected, in_ds = geometry_ds + expected = expected.copy(deep=True) + # This isn't really a roundtrip test out = xr.merge([in_ds.drop_vars("geometry"), cfxr.shapely_to_cf(in_ds.geometry)]) + del expected.data.attrs["geometry"] xr.testing.assert_identical(out, expected) out = xr.merge( diff --git a/doc/api.rst b/doc/api.rst index 1d4b659c..4651f017 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -106,6 +106,7 @@ Attributes Dataset.cf.coordinates Dataset.cf.formula_terms Dataset.cf.grid_mapping_names + Dataset.cf.geometries Dataset.cf.standard_names .. _dsmeth: diff --git a/doc/geometry.md b/doc/geometry.md index ac840139..3a2380f8 100644 --- a/doc/geometry.md +++ b/doc/geometry.md @@ -8,73 +8,105 @@ kernelspec: --- ```{eval-rst} -.. currentmodule:: cf_xarray +.. currentmodule:: xarray ``` # Geometries ```{seealso} -[The CF conventions on Geometries](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#geometries) +1. [The CF conventions on Geometries](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#geometries) +1. {py:attr}`Dataset.cf.geometries` ``` -`cf_xarray` can convert between vector geometries represented as shapely objects -and CF-compliant array representations of those geometries. +```{eval-rst} +.. currentmodule:: cf_xarray +``` -Let's start by creating an xarray object containing some shapely geometries. This example uses -a `xr.DataArray` but these functions also work with a `xr.Dataset` where one of the data variables -contains an array of shapes. +First read an example dataset with CF-encoded geometries ```{code-cell} import cf_xarray as cfxr +import cf_xarray.datasets import xarray as xr -from shapely.geometry import MultiPoint, Point - -da = xr.DataArray( - [ - MultiPoint([(1.0, 2.0), (2.0, 3.0)]), - Point(3.0, 4.0), - Point(4.0, 5.0), - Point(3.0, 4.0), - ], - dims=("index",), - name="geometry" -) -da +ds = cfxr.datasets.encoded_point_dataset() +ds +``` + +The {py:attr}`Dataset.cf.geometries` property will yield a mapping from geometry type to geometry container variable name. + +```{code-cell} +ds.cf.geometries +``` + +The `"geometry"` name is special, and will return the geometry *container* present in the dataset + +```{code-cell} +ds.cf["geometry"] +``` + +Request all variables needed to represent a geometry as a Dataset using the geometry type as key. + +```{code-cell} +ds.cf[["point"]] ``` +You *must* request a Dataset as return type, that is provide the list `["point]`, because the CF conventions encode geometries across multiple variables with dimensions that are not present on all variables. Xarray's data model does *not* allow representing such a collection of variables as a DataArray. + +## Encoding & decoding + +`cf_xarray` can convert between vector geometries represented as shapely objects +and CF-compliant array representations of those geometries. + +Let's start by creating an xarray object containing some shapely geometries. This example uses +a `xr.DataArray` but these functions also work with a `xr.Dataset` where one of the data variables +contains an array of shapes. + ```{warning} `cf_xarray` does not support handle multiple types of shapes (Point, Line, Polygon) in one `xr.DataArray`, but multipart geometries are supported and can be mixed with single-part geometries of the same type. ``` -## Encode / Decode - `cf-xarray` provides {py:func}`geometry.encode_geometries` and {py:func}`geometry.decode_geometries` to encode and decode xarray Datasets to/from a CF-compliant form that can be written to any array storage format. -For example: +For example, here is a Dataset with shapely geometries + +```{code-cell} +ds = cfxr.datasets.point_dataset() +ds +``` + +Encode with the CF-conventions ```{code-cell} -ds = da.to_dataset() encoded = cfxr.geometry.encode_geometries(ds) encoded ``` This dataset can then be written to any format supported by Xarray. -To decode, reverse the process using {py:func}`geometry.decode_geometries` +To decode back to shapely geometries, reverse the process using {py:func}`geometry.decode_geometries` ```{code-cell} decoded = cfxr.geometry.decode_geometries(encoded) ds.identical(decoded) ``` +### Limitations + +The following limitations can be relaxed in the future. PRs welcome! + +1. cf-xarray uses `"geometry_container"` as the name for the geometry variable always +1. cf-xarray only supports decoding a single geometry in a Dataset. +1. CF xarray will not set the `"geometry"` attribute that links a variable to a geometry by default unless the geometry variable is a dimension coordiante for that variable. This heuristic works OK for vector data cubes (e.g. [xvec](https://xvec.readthedocs.io/en/stable/)). You should set the `"geometry"` attribute manually otherwise. Suggestions for better behaviour here are very welcome. + ## Lower-level conversions Encoding a single DataArray is possible using {py:func}`geometry.shapely_to_cf`. ```{code-cell} +da = ds["geometry"] ds_cf = cfxr.shapely_to_cf(da) ds_cf ```