diff --git a/pangeo_forge_runner/feedstock.py b/pangeo_forge_runner/feedstock.py index 491a8aa..f03501d 100644 --- a/pangeo_forge_runner/feedstock.py +++ b/pangeo_forge_runner/feedstock.py @@ -5,6 +5,7 @@ from ruamel.yaml import YAML +from .meta_yaml import MetaYaml from .recipe_rewriter import RecipeRewriter yaml = YAML() @@ -30,7 +31,7 @@ def __init__( """ self.feedstock_dir = feedstock_dir with open(self.feedstock_dir / "meta.yaml") as f: - self.meta = yaml.load(f) + self.meta = MetaYaml(**yaml.load(f)) self.prune = prune self.callable_args_injections = ( @@ -75,18 +76,17 @@ def parse_recipes(self): *Executes arbitrary code* defined in the feedstock recipes. """ recipes = {} - recipes_config = self.meta.get("recipes") - if isinstance(recipes_config, list): - for r in recipes_config: + if isinstance(self.meta.recipes, list): + for r in self.meta.recipes: recipes[r["id"]] = self._import(r["object"]) - elif isinstance(recipes_config, dict): - recipes = self._import(recipes_config["dict_object"]) + elif isinstance(self.meta.recipes, dict): + recipes = self._import(self.meta.recipes["dict_object"]) else: raise ValueError("Could not parse recipes config in meta.yaml") return recipes - def get_expanded_meta(self): + def get_expanded_meta(self, drop_none=True) -> dict: """ Return full meta.yaml file, expanding recipes if needed. @@ -94,11 +94,23 @@ def get_expanded_meta(self): 'object' keys *may* be present, but not guaranteed. """ meta_copy = deepcopy(self.meta) - if "recipes" in self.meta and "dict_object" in self.meta["recipes"]: + if self.meta.recipes and "dict_object" in self.meta.recipes: # We have a dict_object, so let's parse the recipes, and provide # keep just the ids, discarding the values - as the values do not # actually serialize. recipes = self.parse_recipes() - meta_copy["recipes"] = [{"id": k} for k, v in recipes.items()] - - return meta_copy + meta_copy.recipes = [ + # In place of the values, we add a placeholder string, so that the + # re-assignment to the MetaYaml schema here will pass validation + # of the `recipes` field, which requires "id" and "object" fields. + {"id": k, "object": "DICT_VALUE_PLACEHOLDER"} + for k, _ in recipes.items() + ] + + return ( + # the traitlets MetaYaml schema will give us empty containers + # by default, but in most cases lets assume we don't want that + {k: v for k, v in meta_copy.trait_values().items() if v} + if drop_none + else meta_copy.trait_values() + ) diff --git a/pangeo_forge_runner/meta_yaml.py b/pangeo_forge_runner/meta_yaml.py new file mode 100644 index 0000000..19ab404 --- /dev/null +++ b/pangeo_forge_runner/meta_yaml.py @@ -0,0 +1,110 @@ +import jsonschema +from traitlets import Dict, HasTraits, List, TraitError, Unicode, Union, validate + +recipes_field_per_element_schema = { + "type": "object", + "properties": { + "id": {"type": "string"}, + "object": {"type": "string"}, + }, + "required": ["id", "object"], +} + + +class MetaYaml(HasTraits): + """Schema for the ``meta.yaml`` file which must be included in each feedstock directory. + Only the ``recipes`` field is strictly required for ``pangeo-forge-runner`` to function. + All other fields are recommended but not required. + """ + + def __init__(self, recipes=None, **kwargs): + """The only required field is ``recipes``, so we put it explicitly in the init + signature to ensure it is not omitted, as demonstrated in: + https://github.com/ipython/traitlets/issues/490#issuecomment-479716288 + """ + super().__init__(**kwargs) + self.recipes = recipes + + @validate("recipes") + def _validate_recipes(self, proposal): + """Ensure the ``recipes`` trait is not passed as an empty container and that + each element of the field contains all expected subfields. + """ + if not proposal["value"]: + raise TraitError( + f"The ``recipes`` trait, passed as {proposal['value']}, cannot be empty." + ) + + if isinstance(proposal["value"], list): + for recipe_spec in proposal["value"]: + try: + jsonschema.validate(recipe_spec, recipes_field_per_element_schema) + except jsonschema.ValidationError as e: + raise TraitError(e) + return proposal["value"] + + title = Unicode( + allow_none=True, + help=""" + Title for this dataset. + """, + ) + description = Unicode( + allow_none=True, + help=""" + Description of the dataset. + """, + ) + recipes = Union( + [List(Dict()), Dict()], + help=""" + Specifies the deployable Python objects to run in the recipe module. + If the recipes are assigned to their own Python variable names, + should be of the form: + + ```yaml + recipes: + - id: "unique-identifier-for-recipe" + object: "recipe:transforms" + ``` + + Alternatively, if the recipes are values in a Python dict: + + ```yaml + recipes: + dict_object: "recipe:name_of_recipes_dict" + ``` + """, + ) + provenance = Dict( + allow_none=True, + help=""" + Dataset provenance information including provider, license, etc. + """, + per_key_traits={ + "providers": List( + Dict( + per_key_traits={ + "name": Unicode(), + "description": Unicode(), + "roles": List(), # TODO: enum + "url": Unicode(), + }, + ), + ), + "license": Unicode(), # TODO: guidance on suggested formatting (enum?) + }, + ) + maintainers = List( + Dict( + per_key_traits={ + "name": Unicode(help="Full name of the maintainer."), + "orcid": Unicode(help="Maintainer's ORCID ID"), # TODO: regex + "github": Unicode(help="Maintainer's GitHub username."), + }, + ), + allow_none=True, + help=""" + Maintainers of this Pangeo Forge feedstock. + """, + ) diff --git a/tests/unit/test_expand_meta.py b/tests/unit/test_expand_meta.py index 4beb35b..8e95367 100644 --- a/tests/unit/test_expand_meta.py +++ b/tests/unit/test_expand_meta.py @@ -10,8 +10,6 @@ "meta": { "title": "Global Precipitation Climatology Project", "description": "Global Precipitation Climatology Project (GPCP) Daily Version 1.3 gridded, merged ty satellite/gauge precipitation Climate data Record (CDR) from 1996 to present.\n", - "pangeo_forge_version": "0.9.0", - "pangeo_notebook_version": "2022.06.02", "recipes": [{"id": "gpcp-from-gcs", "object": "recipe:recipe"}], "provenance": { "providers": [ @@ -37,7 +35,6 @@ "github": "cisaacstern", } ], - "bakery": {"id": "pangeo-ldeo-nsf-earthcube"}, }, } ] diff --git a/tests/unit/test_feedstock.py b/tests/unit/test_feedstock.py new file mode 100644 index 0000000..a7ac78d --- /dev/null +++ b/tests/unit/test_feedstock.py @@ -0,0 +1,81 @@ +from textwrap import dedent + +import pytest +from ruamel.yaml import YAML + +from pangeo_forge_runner.feedstock import Feedstock +from pangeo_forge_runner.meta_yaml import MetaYaml + +yaml = YAML() + + +@pytest.fixture(params=["recipe_object", "dict_object"]) +def tmp_feedstock(request, tmp_path_factory: pytest.TempPathFactory): + tmpdir = tmp_path_factory.mktemp("feedstock") + if request.param == "recipe_object": + meta_yaml = dedent( + """\ + recipes: + - id: aws-noaa-sea-surface-temp-whoi + object: 'recipe:recipe' + """ + ) + recipe_py = dedent( + """\ + class Recipe: + pass + + recipe = Recipe() + """ + ) + elif request.param == "dict_object": + meta_yaml = dedent( + """\ + recipes: + dict_object: 'recipe:recipes' + """ + ) + recipe_py = dedent( + """\ + class Recipe: + pass + + recipes = {"my_recipe": Recipe()} + """ + ) + + with open(tmpdir / "meta.yaml", mode="w") as f: + f.write(meta_yaml) + with open(tmpdir / "recipe.py", mode="w") as f: + f.write(recipe_py) + + yield tmpdir, meta_yaml, request.param + + +def test_feedstock(tmp_feedstock): + tmpdir, meta_yaml, recipes_section_type = tmp_feedstock + f = Feedstock(feedstock_dir=tmpdir) + # equality of HasTraits instances doesn't work as I might expect, + # so just check equality of the relevant trait (`.recipes`) + assert f.meta.recipes == MetaYaml(**yaml.load(meta_yaml)).recipes + + expanded_meta = f.get_expanded_meta() + recipes = f.parse_recipes() + + for recipe_metadata in expanded_meta["recipes"]: + # the recipe_object metadata looks something like this: + # {'recipes': [{'id': 'my_recipe', 'object': 'DICT_VALUE_PLACEHOLDER'}]} + # and the dict_object metadata looks like this: + # {'recipes': [{'id': 'aws-noaa-sea-surface-temp-whoi', 'object': 'recipe:recipe'}]} + # both have an "id" field: + assert "id" in recipe_metadata + # but only the "recipe_object" has an "object" field: + if recipes_section_type == "recipe_object": + assert "object" in recipe_metadata + elif recipes_section_type == "dict_object": + assert recipe_metadata["object"] == "DICT_VALUE_PLACEHOLDER" + + for r in recipes.values(): + # the values of the recipes dict should all be python objects + # we used the mock type `Recipe` here, so this should be true: + assert str(r).startswith(" str: + return dedent( + """\ + title: 'AWS NOAA WHOI SST' + description: 'Analysis-ready datasets derived from AWS NOAA WHOI NetCDF' + recipes: + - id: aws-noaa-sea-surface-temp-whoi + object: 'recipe:recipe' + provenance: + providers: + - name: 'AWS NOAA Oceanic CDR' + description: 'Registry of Open Data on AWS National Oceanographic & Atmospheric Administration National Centers for Environmental Information' + roles: + - producer + - licensor + url: s3://noaa-cdr-sea-surface-temp-whoi-pds/ + license: 'Open Data' + maintainers: + - name: 'Jo Contributor' + orcid: '0000-0000-0000-0000' + github: jocontributor123 + """ # noqa: E501 + ) + + +@pytest.fixture +def valid_meta_yaml(with_recipes_list: str) -> dict: + return yaml.load(with_recipes_list) + + +@pytest.fixture +def valid_meta_yaml_dict_object(with_recipes_list: str) -> dict: + with_dict_object = with_recipes_list.replace( + dedent( + """\ + recipes: + - id: aws-noaa-sea-surface-temp-whoi + object: 'recipe:recipe' + """ + ), + dedent( + """\ + recipes: + dict_object: 'recipe:recipes' + """ + ), + ) + return yaml.load(with_dict_object) + + +def test_schema_valid(valid_meta_yaml): + _ = MetaYaml(**valid_meta_yaml) + + +def test_schema_valid_dict_object(valid_meta_yaml_dict_object): + _ = MetaYaml(**valid_meta_yaml_dict_object) + + +@pytest.mark.parametrize( + "field", + [ + "title", + "description", + "recipes", + "provenance", + "maintainers", + ], +) +def test_missing_toplevel_field(valid_meta_yaml, field): + meta_yaml_copy = copy.deepcopy(valid_meta_yaml) + del meta_yaml_copy[field] + if field == "recipes": + # ``recipes`` is the only required field + with pytest.raises(TraitError): + _ = MetaYaml(**meta_yaml_copy) + else: + # all others fields can be left empty without raising an error + _ = MetaYaml(**meta_yaml_copy) + + +@pytest.mark.parametrize( + "subfield", + [ + "id", + "object", + ], +) +def test_missing_recipes_subfield(valid_meta_yaml, subfield): + invalid_meta_yaml = copy.deepcopy(valid_meta_yaml) + del invalid_meta_yaml["recipes"][0][subfield] + + with pytest.raises(TraitError): + _ = MetaYaml(**invalid_meta_yaml) + + +def test_recipes_field_cannot_be_empty_container(): + with pytest.raises(TraitError): + _ = MetaYaml(recipes=[]) + + +# TODO: In a future "strict" mode, ensure provenance fields are all provided. +# -------------------------------------------------------------------------- +# @pytest.mark.parametrize( +# "subfield", +# [ +# "providers", +# "license", +# ], +# ) +# def test_missing_provenance_subfield(valid_meta_yaml, subfield, schema): +# invalid_meta_yaml = copy.deepcopy(valid_meta_yaml) +# del invalid_meta_yaml["provenance"][subfield] +# +# with pytest.raises(TraitError): +# _ = MetaYaml(**meta_yaml_copy) + + +# TODO: In a future "strict" mode, ensure providers fields are all provided. +# -------------------------------------------------------------------------- +# @pytest.mark.parametrize( +# "subfield", +# [ +# "name", +# "description", +# "roles", +# "url", +# ], +# ) +# def test_missing_providers_subfield(valid_meta_yaml, subfield, schema): +# invalid_meta_yaml = copy.deepcopy(valid_meta_yaml) +# del invalid_meta_yaml["provenance"]["providers"][0][subfield] +# +# with pytest.raises(ValidationError, match=f"'{subfield}' is a required property"): +# _ = MetaYaml(**meta_yaml_copy) + + +# TODO: In a future "strict" mode, ensure maintainers fields are all provided. +# ---------------------------------------------------------------------------- +# @pytest.mark.parametrize( +# "subfield", +# [ +# "name", +# "orcid", +# "github", +# ], +# ) +# def test_missing_maintainers_subfield(valid_meta_yaml, subfield, schema): +# invalid_meta_yaml = copy.deepcopy(valid_meta_yaml) +# del invalid_meta_yaml["maintainers"][0][subfield] +# +# with pytest.raises(TraitError): +# _ = MetaYaml(**meta_yaml_copy) diff --git a/tests/unit/test_recipes.py b/tests/unit/test_recipes.py index 9d658c6..1d6e9a2 100644 --- a/tests/unit/test_recipes.py +++ b/tests/unit/test_recipes.py @@ -2,6 +2,7 @@ import pytest from ruamel.yaml import YAML +from traitlets import TraitError from pangeo_forge_runner import Feedstock @@ -29,12 +30,14 @@ def test_recipes_dict(): with open(list_recipe / "meta.yaml") as f: meta = yaml.load(f) - meta["recipes"] = [{"id": "test_1"}, {"id": "test_2"}] + meta["recipes"] = [ + {"id": "test_1", "object": "DICT_VALUE_PLACEHOLDER"}, + {"id": "test_2", "object": "DICT_VALUE_PLACEHOLDER"}, + ] assert meta == feed.get_expanded_meta() def test_recipes_broken(): list_recipe = HERE / "test-recipes/broken-recipe/feedstock" - feed = Feedstock(list_recipe) - with pytest.raises(ValueError): - feed.parse_recipes() + with pytest.raises(TraitError): + _ = Feedstock(list_recipe)