diff --git a/docs/source/developer_guide/metadata.md b/docs/source/developer_guide/metadata.md index a278286b9..af8b8ff6a 100644 --- a/docs/source/developer_guide/metadata.md +++ b/docs/source/developer_guide/metadata.md @@ -198,6 +198,25 @@ When called, the `SchemasProvider` instance is responsible for producing a list Some `SchemasProvider` instances may have schema information which is dynamic, based on the active configuration of the system. In such cases, they are responsible for returning information pertinent to the current configuration. For example, a schema may use an `enum` that holds a set of values pertinent to the system's configuration. In this case, the `SchemasProvider` may alter its static schema to dynamically update its enumerated property corresponding to the current runtime environment. Note however, that care should be taken when dynamically altering schema since it could lead to instances that will no longer load because they (now) fail schema validation if the contents of the enum no longer satisfy previous instance values. Of course, this may very well be proper behavior, depending on the intention. +#### Validators +A [_validator_](https://python-jsonschema.readthedocs.io/en/stable/creating/) is used +to validate instances of a given schema. Within the metadata service, instances +are validated whenever they are retrieved, and immediately prior to their persistence. + +The `SchemasProvider` base class provides an instance of the `Draft7Validator` which should +be sufficient for most schemas. Implementations requiring their own validator, or that wish to +extend the `Draft7Validator`, must override the `get_validator()` method in their + subclass of `SchemasProvider`: +```python + def get_validator(self, schema: dict) -> Any: + """Returns an instance of a validator used to validate instances of the given schema. + + The default implementation uses the `Draft7Validator`. SchemasProviders requiring + the use of a different validator must override this method. + """ + return Draft7Validator(schema, format_checker=draft7_format_checker) +``` + #### Registration Like `Schemaspaces`, `SchemasProvider` implementations are discovered via the entrypoints functionality. When the `SchemaManager` starts, and after it loads all `Schemaspaces`, it fetches all entrypoint instances named by the group `'metadata.schemas_providers'`, and uses the entrypoint data to load each found instance. diff --git a/elyra/metadata/manager.py b/elyra/metadata/manager.py index 4a1631907..ecb6a9ccb 100644 --- a/elyra/metadata/manager.py +++ b/elyra/metadata/manager.py @@ -13,20 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import os import re from typing import Any from typing import Dict from typing import List from typing import Union -from jsonschema import draft7_format_checker -from jsonschema import validate from jsonschema import ValidationError from traitlets import Type # noqa H306 from traitlets.config import LoggingConfigurable # noqa H306 -from elyra.metadata.error import SchemaNotFoundError from elyra.metadata.metadata import Metadata from elyra.metadata.schema import SchemaManager from elyra.metadata.storage import FileMetadataStore @@ -155,9 +151,8 @@ def validate(self, name: str, metadata: Metadata) -> None: f"Instance '{name}' in the {self.schemaspace} schemaspace is missing a 'schema_name' field!" ) - schema = self._get_schema(schema_name) # returns a value or throws try: - validate(instance=metadata_dict, schema=schema, format_checker=draft7_format_checker) + self.schema_mgr.validate_instance(self.schemaspace, schema_name, metadata_dict) except ValidationError as ve: # Because validation errors are so verbose, only provide the first line. first_line = str(ve).partition("\n")[0] @@ -179,21 +174,6 @@ def get_normalized_name(name: str) -> str: name = name + "_0" return name - def _get_schema(self, schema_name: str) -> dict: - """Loads the schema based on the schema_name and returns the loaded schema json. - Throws ValidationError if schema file is not present. - """ - schema_json = self.schema_mgr.get_schema(self.schemaspace, schema_name) - if schema_json is None: - schema_file = os.path.join(os.path.dirname(__file__), "schemas", schema_name + ".json") - if not os.path.exists(schema_file): - self.log.error( - f"The file for schema '{schema_name}' is missing from its expected location: '{schema_file}'" - ) - raise SchemaNotFoundError(f"The file for schema '{schema_name}' is missing!") - - return schema_json - def _save(self, name: str, metadata: Metadata, for_update: bool = False, for_migration: bool = False) -> Metadata: if not metadata: raise ValueError("An instance of class 'Metadata' was not provided.") diff --git a/elyra/metadata/schema.py b/elyra/metadata/schema.py index f1098d369..7ee8e7746 100644 --- a/elyra/metadata/schema.py +++ b/elyra/metadata/schema.py @@ -22,6 +22,7 @@ from logging import Logger import os import re +from typing import Any from typing import Dict from typing import List from typing import Optional @@ -37,6 +38,7 @@ def final(meth): from entrypoints import get_group_all from jsonschema import draft7_format_checker +from jsonschema import Draft7Validator from jsonschema import validate from jsonschema import ValidationError from traitlets.config import LoggingConfigurable @@ -56,18 +58,22 @@ class SchemaManager(SingletonConfigurable): # Maps SchemaspaceID AND SchemaspaceName to Schemaspace instance (two entries from Schemaspace instance) schemaspaces: Dict[str, "Schemaspace"] - # Maps SchemaspaceID to ShemaspaceName + # Maps SchemaspaceID to SchemaspaceName schemaspace_id_to_name: Dict[str, str] # Maps SchemaspaceID to mapping of schema name to SchemasProvider schemaspace_schemasproviders: Dict[str, Dict[str, "SchemasProvider"]] + # Maps SchemaspaceID to mapping of schema name to validator + schemaspace_schema_validators: Dict[str, Dict[str, Any]] + def __init__(self, **kwargs): super().__init__(**kwargs) self.schemaspaces = {} self.schemaspace_id_to_name = {} self.schemaspace_schemasproviders = {} + self.schemaspace_schema_validators = {} # The following exposes the metadata-test schemaspace if true or 1. # Metadata testing will enable this env. Note: this cannot be globally @@ -137,6 +143,12 @@ def clear_all(self) -> None: self.log.debug("SchemaManager: Reloading all schemas for all schemaspaces.") self._load_schemaspace_schemas() + def validate_instance(self, schemaspace_name_or_id: str, schema_name: str, instance: dict) -> None: + self._validate_schemaspace(schemaspace_name_or_id) + schemaspace_id = self.schemaspaces[schemaspace_name_or_id].id + validator = self.schemaspace_schema_validators[schemaspace_id][schema_name] + validator.validate(instance) + def _validate_schemaspace(self, schemaspace_name_or_id: str) -> None: """Ensures the schemaspace is valid and raises ValueError if it is not.""" if schemaspace_name_or_id.lower() not in self.schemaspaces: @@ -200,6 +212,7 @@ def _load_schemas_providers(self): f"SchemasProvider instance '{schemas_provider_ep.name}' is not an " f"instance of '{SchemasProvider.__name__}'!" ) + provider_validators = {} schemas = schemas_provider.get_schemas() for schema in schemas: try: @@ -230,7 +243,10 @@ def _load_schemas_providers(self): self.schemaspaces[schemaspace_id].add_schema(schema) providers = self.schemaspace_schemasproviders.get(schemaspace_id, {}) providers[schema_name] = schemas_provider # Capture the schemasprovider for this schema + provider_validators = self.schemaspace_schema_validators.get(schemaspace_id, {}) + provider_validators[schema_name] = schemas_provider.get_validator(schema) # Capture validator self.schemaspace_schemasproviders[schemaspace_id] = providers + self.schemaspace_schema_validators[schemaspace_id] = provider_validators except Exception as schema_err: self.log.error( @@ -422,3 +438,11 @@ def migrate(self, *args, **kwargs) -> List[str]: The method will return a list of migrated instances or an empty array. """ return list() + + def get_validator(self, schema: dict) -> Any: + """Returns an instance of a validator used to validate instances of the given schema. + + The default implementation uses the `Draft7Validator`. SchemasProviders requiring the use + of a different validator must override this method. + """ + return Draft7Validator(schema, format_checker=draft7_format_checker) diff --git a/elyra/tests/metadata/test_metadata.py b/elyra/tests/metadata/test_metadata.py index af3afd6d7..ee84f26c3 100644 --- a/elyra/tests/metadata/test_metadata.py +++ b/elyra/tests/metadata/test_metadata.py @@ -689,6 +689,37 @@ def test_manager_hierarchy_remove(tests_hierarchy_manager, factory_location, sha assert byo_1.resource.startswith(str(factory_location)) +@pytest.mark.skipif( + os.getenv("TEST_VALIDATION_PERFORMANCE", "0") != "1", + reason="test_validation_performance - enable via env TEST_VALIDATION_PERFORMANCE=1", +) +def test_validation_performance(): + import psutil + + metadata_mgr = MetadataManager(schemaspace=METADATA_TEST_SCHEMASPACE) + metadata_dict = {**valid_metadata_json} + metadata = Metadata.from_dict(METADATA_TEST_SCHEMASPACE_ID, metadata_dict) + + process = psutil.Process(os.getpid()) + # warm up + metadata_mgr.validate("perf_test", metadata) + + iterations = 10000 + memory_start = process.memory_info() + t0 = time.time() + for _ in range(0, iterations): + metadata_mgr.validate("perf_test", metadata) + + t1 = time.time() + memory_end = process.memory_info() + diff = (memory_end.rss - memory_start.rss) / 1024 + print( + f"Memory: {diff:,} kb, Start: {memory_start.rss / 1024 / 1024:,.3f} mb, " + f"End: {memory_end.rss / 1024 / 1024:,.3f} mb., " + f"Elapsed time: {t1-t0:.3f}s over {iterations} iterations." + ) + + # ########################## MetadataStore Tests ########################### def test_store_schemaspace(store_manager, schemaspace_location): # Delete the metadata dir contents and attempt listing metadata diff --git a/elyra/tests/metadata/test_metadata_app.py b/elyra/tests/metadata/test_metadata_app.py index 4812fc374..a820180cb 100644 --- a/elyra/tests/metadata/test_metadata_app.py +++ b/elyra/tests/metadata/test_metadata_app.py @@ -1100,7 +1100,7 @@ def test_remove_no_name(script_runner): assert "ERROR: '--name' is a required parameter." in ret.stdout -def test_remove_with_no_equals(script_runner): +def test_remove_with_no_equals(script_runner, mock_data_dir): # Attempt removal w/o the '=' between parameter and value metadata_manager = MetadataManager(schemaspace=METADATA_TEST_SCHEMASPACE) valid = Metadata(**valid_metadata_json)