diff --git a/.gitignore b/.gitignore
index 8ddc68ae..d88bffc0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,6 @@ var
 build
 dist/
 target/
+*.DS_Store
+*/.DS_Store
 docs/build/
diff --git a/.isort.cfg b/.isort.cfg
index fec62009..9accadcc 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,5 +1,6 @@
 [settings]
-known_third_party = dask,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr
+
+known_third_party = cached_path,dask,jsonschema,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
diff --git a/environment.yml b/environment.yml
index 2dbb7d64..01b20d24 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,8 +4,10 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - cached_path
   - flake8
   - ipython
+  - jsonschema
   - mypy
   - omero-py
   - pip
diff --git a/ome_zarr/cli.py b/ome_zarr/cli.py
index 9b778d13..6f2e6f0a 100755
--- a/ome_zarr/cli.py
+++ b/ome_zarr/cli.py
@@ -9,6 +9,7 @@
 from .scale import Scaler
 from .utils import download as zarr_download
 from .utils import info as zarr_info
+from .utils import validate as zarr_validate
 
 
 def config_logging(loglevel: int, args: argparse.Namespace) -> None:
@@ -29,6 +30,12 @@ def info(args: argparse.Namespace) -> None:
     list(zarr_info(args.path, stats=args.stats))
 
 
+def validate(args: argparse.Namespace) -> None:
+    """Wrap the :func:`~ome_zarr.utils.validate` method."""
+    config_logging(logging.WARN, args)
+    list(zarr_validate(args.path, args.strict, args.clear_cache))
+
+
 def download(args: argparse.Namespace) -> None:
     """Wrap the :func:`~ome_zarr.utils.download` method."""
     config_logging(logging.WARN, args)
@@ -99,6 +106,19 @@ def main(args: List[str] = None) -> None:
     parser_info.add_argument("--stats", action="store_true")
     parser_info.set_defaults(func=info)
 
+    # validate
+    parser_validate = subparsers.add_parser("validate")
+    parser_validate.add_argument("path")
+    parser_validate.add_argument(
+        "--strict", action="store_true", help="validate using a strict schema"
+    )
+    parser_validate.add_argument(
+        "--clear_cache",
+        action="store_true",
+        help="Remove any cached schemas to force reload",
+    )
+    parser_validate.set_defaults(func=validate)
+
     # download
     parser_download = subparsers.add_parser("download")
     parser_download.add_argument("path")
diff --git a/ome_zarr/data.py b/ome_zarr/data.py
index d669459a..81500dcd 100644
--- a/ome_zarr/data.py
+++ b/ome_zarr/data.py
@@ -148,19 +148,19 @@ def create_zarr(
         "channels": [
             {
                 "color": "FF0000",
-                "window": {"start": 0, "end": 255},
+                "window": {"start": 0, "end": 255, "min": 0, "max": 255},
                 "label": "Red",
                 "active": True,
             },
             {
                 "color": "00FF00",
-                "window": {"start": 0, "end": 255},
+                "window": {"start": 0, "end": 255, "min": 0, "max": 255},
                 "label": "Green",
                 "active": True,
             },
             {
                 "color": "0000FF",
-                "window": {"start": 0, "end": 255},
+                "window": {"start": 0, "end": 255, "min": 0, "max": 255},
                 "label": "Blue",
                 "active": True,
             },
diff --git a/ome_zarr/format.py b/ome_zarr/format.py
index 5bca76de..959cd47e 100644
--- a/ome_zarr/format.py
+++ b/ome_zarr/format.py
@@ -64,7 +64,7 @@ def init_store(self, path: str, mode: str = "r") -> FSStore:
     def init_channels(self) -> None:  # pragma: no cover
         raise NotImplementedError()
 
-    def _get_metadata_version(self, metadata: dict) -> Optional[str]:
+    def get_metadata_version(self, metadata: dict) -> Optional[str]:
         """
         Checks the metadata dict for a version
 
@@ -127,7 +127,7 @@ def version(self) -> str:
         return "0.1"
 
     def matches(self, metadata: dict) -> bool:
-        version = self._get_metadata_version(metadata)
+        version = self.get_metadata_version(metadata)
         LOGGER.debug(f"{self.version} matches {version}?")
         return version == self.version
 
diff --git a/ome_zarr/reader.py b/ome_zarr/reader.py
index c77e8162..381b98b6 100644
--- a/ome_zarr/reader.py
+++ b/ome_zarr/reader.py
@@ -1,5 +1,6 @@
 """Reading logic for ome-zarr."""
 
+import json
 import logging
 import math
 from abc import ABC
@@ -7,16 +8,29 @@
 
 import dask.array as da
 import numpy as np
+from cached_path import cached_path
 from dask import delayed
+from jsonschema import Draft202012Validator as Validator
+from jsonschema import RefResolver
+from jsonschema import validate as jsonschema_validate
 
 from .axes import Axes
-from .format import format_from_version
+from .format import CurrentFormat, detect_format, format_from_version
 from .io import ZarrLocation
 from .types import JSONDict
 
 LOGGER = logging.getLogger("ome_zarr.reader")
 
 
+def get_schema(name: str, version: str, strict: bool = False) -> Dict:
+    pre = "strict_" if strict else ""
+    schema_url = f"https://ngff.openmicroscopy.org/{version}/schemas/{pre}{name}.schema"
+    local_path = cached_path(schema_url)
+    with open(local_path) as f:
+        sch_string = f.read()
+    return json.loads(sch_string)
+
+
 class Node:
     """Container for a representation of the binary data somewhere in the data
     hierarchy."""
@@ -106,6 +120,12 @@ def load(self, spec_type: Type["Spec"]) -> Optional["Spec"]:
                 return spec
         return None
 
+    def validate(self, strict: bool) -> None:
+        # Validation for a node is delegated to each spec
+        # e.g. Labels may have spec for multiscales and labels
+        for spec in self.specs:
+            spec.validate(strict)
+
     def add(
         self,
         zarr: ZarrLocation,
@@ -162,6 +182,9 @@ class Spec(ABC):
     Multiple subclasses may apply.
     """
 
+    SCHEMA_NAME: str
+    version: str
+
     @staticmethod
     def matches(zarr: ZarrLocation) -> bool:
         raise NotImplementedError()
@@ -169,6 +192,9 @@ def matches(zarr: ZarrLocation) -> bool:
     def __init__(self, node: Node) -> None:
         self.node = node
         self.zarr = node.zarr
+        fmt = detect_format(self.zarr.root_attrs, CurrentFormat())
+        version = fmt.get_metadata_version(self.zarr.root_attrs)
+        self.version = version if version is not None else fmt.version
         LOGGER.debug(f"treating {self.zarr} as {self.__class__.__name__}")
         for k, v in self.zarr.root_attrs.items():
             LOGGER.info("root_attr: %s", k)
@@ -177,6 +203,35 @@
     def lookup(self, key: str, default: Any) -> Any:
         return self.zarr.root_attrs.get(key, default)
 
+    def validate(self, strict: bool = False) -> None:
+        if not hasattr(self, "SCHEMA_NAME"):
+            LOGGER.info("No schema for %s" % self.zarr)
+            return
+        LOGGER.info("Validating Multiscales spec at: %s" % self.zarr)
+        schema = get_schema(self.SCHEMA_NAME, self.version)
+
+        # Always do a validation with the MUST rules
+        # Will throw ValidationException if it fails
+        json_data = self.zarr.root_attrs
+        jsonschema_validate(instance=json_data, schema=schema)
+
+        # If we're also checking for SHOULD rules,
+        # we want to iterate all errors and show as Warnings
+        if strict:
+            strict_schema = get_schema(self.SCHEMA_NAME, self.version, strict=True)
+            if strict_schema is None:
+                return
+            # we only need this store to allow use of cached schemas
+            # (and potential off-line use)
+            schema_store = {
+                schema["$id"]: schema,
+                strict_schema["$id"]: strict_schema,
+            }
+            resolver = RefResolver.from_schema(strict_schema, store=schema_store)
+            validator = Validator(strict_schema, resolver=resolver)
+            for error in validator.iter_errors(json_data):
+                LOGGER.warn(error.message)
+
 
 class Labels(Spec):
     """Relatively small specification for the well-known "labels" group which only
@@ -266,6 +321,9 @@ def __init__(self, node: Node) -> None:
 
 
 class Multiscales(Spec):
+
+    SCHEMA_NAME = "image"
+
     @staticmethod
     def matches(zarr: ZarrLocation) -> bool:
         """is multiscales metadata present?"""
@@ -279,12 +337,9 @@ def __init__(self, node: Node) -> None:
 
         try:
             multiscales = self.lookup("multiscales", [])
-            version = multiscales[0].get(
-                "version", "0.1"
-            )  # should this be matched with Format.version?
             datasets = multiscales[0]["datasets"]
             axes = multiscales[0].get("axes")
-            fmt = format_from_version(version)
+            fmt = format_from_version(self.version)
             # Raises ValueError if not valid
             axes_obj = Axes(axes, fmt)
             node.metadata["axes"] = axes_obj.to_list()
@@ -299,7 +354,7 @@ def __init__(self, node: Node) -> None:
             return  # EARLY EXIT
 
         for resolution in self.datasets:
-            data: da.core.Array = self.array(resolution, version)
+            data: da.core.Array = self.array(resolution, self.version)
             chunk_sizes = [
                 str(c[0]) + (" (+ %s)" % c[-1] if c[-1] != c[0] else "")
                 for c in data.chunks
@@ -395,6 +450,9 @@ def __init__(self, node: Node) -> None:
 
 
 class Well(Spec):
+
+    SCHEMA_NAME = "well"
+
     @staticmethod
     def matches(zarr: ZarrLocation) -> bool:
         return bool("well" in zarr.root_attrs)
@@ -467,6 +525,9 @@ def get_lazy_well(level: int, tile_shape: tuple) -> da.Array:
 
 
 class Plate(Spec):
+
+    SCHEMA_NAME = "plate"
+
     @staticmethod
     def matches(zarr: ZarrLocation) -> bool:
         return bool("plate" in zarr.root_attrs)
@@ -474,6 +535,8 @@ def matches(zarr: ZarrLocation) -> bool:
     def __init__(self, node: Node) -> None:
         super().__init__(node)
         LOGGER.debug(f"Plate created with ZarrLocation fmt:{ self.zarr.fmt}")
+        self.plate_data = self.lookup("plate", {})
+        LOGGER.info("plate_data: %s", self.plate_data)
         self.get_pyramid_lazy(node)
 
     def get_pyramid_lazy(self, node: Node) -> None:
@@ -481,8 +544,6 @@ def get_pyramid_lazy(self, node: Node) -> None:
         Return a pyramid of dask data, where the highest resolution is
         the stitched full-resolution images.
         """
-        self.plate_data = self.lookup("plate", {})
-        LOGGER.info("plate_data: %s", self.plate_data)
         self.rows = self.plate_data.get("rows")
         self.columns = self.plate_data.get("columns")
         self.first_field = "0"
diff --git a/ome_zarr/utils.py b/ome_zarr/utils.py
index 21aa92b9..5985e2a0 100644
--- a/ome_zarr/utils.py
+++ b/ome_zarr/utils.py
@@ -2,12 +2,14 @@
 
 import json
 import logging
+import shutil
 from pathlib import Path
-from typing import Iterator, List
+from typing import Callable, Iterator, List
 
 import dask
 import dask.array as da
 import zarr
+from cached_path import get_cache_dir
 from dask.diagnostics import ProgressBar
 
 from .io import parse_url
@@ -17,21 +19,26 @@
 LOGGER = logging.getLogger("ome_zarr.utils")
 
 
-def info(path: str, stats: bool = False) -> Iterator[Node]:
-    """Print information about an OME-Zarr fileset.
-
-    All :class:`Nodes <ome_zarr.reader.Node>` that are found from the given path will
-    be visited recursively.
-    """
+def visit(path: str, func: Callable) -> Iterator[Node]:
+    """Call func(node) for each node read from path."""
     zarr = parse_url(path)
     assert zarr, f"not a zarr: {zarr}"
     reader = Reader(zarr)
     for node in reader():
-
         if not node.specs:
             print(f"not an ome-zarr node: {node}")
             continue
-
+        yield func(node)
+
+
+def info(path: str, stats: bool = False) -> Iterator[Node]:
+    """Print information about an OME-Zarr fileset.
+
+    All :class:`Nodes <ome_zarr.reader.Node>` that are found from the given path will
+    be visited recursively.
+ """ + def func(node: Node) -> Node: print(node) print(" - metadata") for spec in node.specs: @@ -43,7 +50,29 @@ def info(path: str, stats: bool = False) -> Iterator[Node]: minmax = f" minmax={dask.compute(array.min(), array.max())}" print(f" - {array.shape}{minmax}") LOGGER.debug(node.data) - yield node + return node + + return visit(path, func) + + +def validate(path: str, strict: bool, clear_cache: bool = False) -> Iterator[Node]: + """ + Validate OME-NGFF data + + All :class:`Nodes ` that are found from the given path will + be visited recursively. + """ + + if clear_cache: + dir_path = get_cache_dir() + shutil.rmtree(dir_path, ignore_errors=True) + + def func(node: Node) -> Node: + if hasattr(node, "validate"): + node.validate(strict) + return node + + return visit(path, func) def download(input_path: str, output_dir: str = ".") -> None: diff --git a/setup.py b/setup.py index 9eef2c22..43e60235 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,8 @@ def read(fname): install_requires += (["requests"],) install_requires += (["scikit-image"],) install_requires += (["toolz"],) +install_requires += (["jsonschema"],) +install_requires += (["cached_path"],) setup( diff --git a/tests/test_cli.py b/tests/test_cli.py index 3a0d91dd..dc8f99f9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,6 +33,15 @@ def test_astronaut_info(self): main(["create", "--method=astronaut", filename]) main(["info", filename]) + @pytest.mark.parametrize("strict", [False, True]) + def test_astronaut_validation(self, strict): + filename = str(self.path) + "-2" + main(["create", "--method=astronaut", filename]) + if strict: + main(["validate", "--strict", filename]) + else: + main(["validate", filename]) + def test_astronaut_download(self, tmpdir): out = str(tmpdir / "out") filename = str(self.path) + "-3"