Skip to content

Commit

Permalink
Add support for schema validators (#2829)
Browse files Browse the repository at this point in the history
* Add support for schema validators

* Remove unnecessary statement caught by CodeQL
  • Loading branch information
kevin-bates authored Jul 19, 2022
1 parent b92d92e commit 35ff340
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 23 deletions.
19 changes: 19 additions & 0 deletions docs/source/developer_guide/metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,25 @@ When called, the `SchemasProvider` instance is responsible for producing a list

Some `SchemasProvider` instances may have schema information which is dynamic, based on the active configuration of the system. In such cases, they are responsible for returning information pertinent to the current configuration. For example, a schema may use an `enum` that holds a set of values pertinent to the system's configuration. In this case, the `SchemasProvider` may alter its static schema to dynamically update its enumerated property corresponding to the current runtime environment. Note however, that care should be taken when dynamically altering schema since it could lead to instances that will no longer load because they (now) fail schema validation if the contents of the enum no longer satisfy previous instance values. Of course, this may very well be proper behavior, depending on the intention.

#### Validators
A [_validator_](https://python-jsonschema.readthedocs.io/en/stable/creating/) is used
to validate instances of a given schema. Within the metadata service, instances
are validated whenever they are retrieved, and immediately prior to their persistence.

The `SchemasProvider` base class provides an instance of the `Draft7Validator` which should
be sufficient for most schemas. Implementations requiring their own validator, or that wish to
extend the `Draft7Validator`, must override the `get_validator()` method in their
subclass of `SchemasProvider`:
```python
def get_validator(self, schema: dict) -> Any:
"""Returns an instance of a validator used to validate instances of the given schema.
The default implementation uses the `Draft7Validator`. SchemasProviders requiring
the use of a different validator must override this method.
"""
return Draft7Validator(schema, format_checker=draft7_format_checker)
```

#### Registration
Like `Schemaspaces`, `SchemasProvider` implementations are discovered via the entrypoints functionality. When the `SchemaManager` starts, and after it loads all `Schemaspaces`, it fetches all entrypoint instances named by the group `'metadata.schemas_providers'`, and uses the entrypoint data to load each found instance.

Expand Down
22 changes: 1 addition & 21 deletions elyra/metadata/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
from typing import Any
from typing import Dict
from typing import List
from typing import Union

from jsonschema import draft7_format_checker
from jsonschema import validate
from jsonschema import ValidationError
from traitlets import Type # noqa H306
from traitlets.config import LoggingConfigurable # noqa H306

from elyra.metadata.error import SchemaNotFoundError
from elyra.metadata.metadata import Metadata
from elyra.metadata.schema import SchemaManager
from elyra.metadata.storage import FileMetadataStore
Expand Down Expand Up @@ -155,9 +151,8 @@ def validate(self, name: str, metadata: Metadata) -> None:
f"Instance '{name}' in the {self.schemaspace} schemaspace is missing a 'schema_name' field!"
)

schema = self._get_schema(schema_name) # returns a value or throws
try:
validate(instance=metadata_dict, schema=schema, format_checker=draft7_format_checker)
self.schema_mgr.validate_instance(self.schemaspace, schema_name, metadata_dict)
except ValidationError as ve:
# Because validation errors are so verbose, only provide the first line.
first_line = str(ve).partition("\n")[0]
Expand All @@ -179,21 +174,6 @@ def get_normalized_name(name: str) -> str:
name = name + "_0"
return name

def _get_schema(self, schema_name: str) -> dict:
"""Loads the schema based on the schema_name and returns the loaded schema json.
Throws ValidationError if schema file is not present.
"""
schema_json = self.schema_mgr.get_schema(self.schemaspace, schema_name)
if schema_json is None:
schema_file = os.path.join(os.path.dirname(__file__), "schemas", schema_name + ".json")
if not os.path.exists(schema_file):
self.log.error(
f"The file for schema '{schema_name}' is missing from its expected location: '{schema_file}'"
)
raise SchemaNotFoundError(f"The file for schema '{schema_name}' is missing!")

return schema_json

def _save(self, name: str, metadata: Metadata, for_update: bool = False, for_migration: bool = False) -> Metadata:
if not metadata:
raise ValueError("An instance of class 'Metadata' was not provided.")
Expand Down
26 changes: 25 additions & 1 deletion elyra/metadata/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from logging import Logger
import os
import re
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
Expand All @@ -37,6 +38,7 @@ def final(meth):

from entrypoints import get_group_all
from jsonschema import draft7_format_checker
from jsonschema import Draft7Validator
from jsonschema import validate
from jsonschema import ValidationError
from traitlets.config import LoggingConfigurable
Expand All @@ -56,18 +58,22 @@ class SchemaManager(SingletonConfigurable):
# Maps SchemaspaceID AND SchemaspaceName to Schemaspace instance (two entries from Schemaspace instance)
schemaspaces: Dict[str, "Schemaspace"]

# Maps SchemaspaceID to ShemaspaceName
# Maps SchemaspaceID to SchemaspaceName
schemaspace_id_to_name: Dict[str, str]

# Maps SchemaspaceID to mapping of schema name to SchemasProvider
schemaspace_schemasproviders: Dict[str, Dict[str, "SchemasProvider"]]

# Maps SchemaspaceID to mapping of schema name to validator
schemaspace_schema_validators: Dict[str, Dict[str, Any]]

def __init__(self, **kwargs):
super().__init__(**kwargs)

self.schemaspaces = {}
self.schemaspace_id_to_name = {}
self.schemaspace_schemasproviders = {}
self.schemaspace_schema_validators = {}

# The following exposes the metadata-test schemaspace if true or 1.
# Metadata testing will enable this env. Note: this cannot be globally
Expand Down Expand Up @@ -137,6 +143,12 @@ def clear_all(self) -> None:
self.log.debug("SchemaManager: Reloading all schemas for all schemaspaces.")
self._load_schemaspace_schemas()

def validate_instance(self, schemaspace_name_or_id: str, schema_name: str, instance: dict) -> None:
self._validate_schemaspace(schemaspace_name_or_id)
schemaspace_id = self.schemaspaces[schemaspace_name_or_id].id
validator = self.schemaspace_schema_validators[schemaspace_id][schema_name]
validator.validate(instance)

def _validate_schemaspace(self, schemaspace_name_or_id: str) -> None:
"""Ensures the schemaspace is valid and raises ValueError if it is not."""
if schemaspace_name_or_id.lower() not in self.schemaspaces:
Expand Down Expand Up @@ -200,6 +212,7 @@ def _load_schemas_providers(self):
f"SchemasProvider instance '{schemas_provider_ep.name}' is not an "
f"instance of '{SchemasProvider.__name__}'!"
)
provider_validators = {}
schemas = schemas_provider.get_schemas()
for schema in schemas:
try:
Expand Down Expand Up @@ -230,7 +243,10 @@ def _load_schemas_providers(self):
self.schemaspaces[schemaspace_id].add_schema(schema)
providers = self.schemaspace_schemasproviders.get(schemaspace_id, {})
providers[schema_name] = schemas_provider # Capture the schemasprovider for this schema
provider_validators = self.schemaspace_schema_validators.get(schemaspace_id, {})
provider_validators[schema_name] = schemas_provider.get_validator(schema) # Capture validator
self.schemaspace_schemasproviders[schemaspace_id] = providers
self.schemaspace_schema_validators[schemaspace_id] = provider_validators

except Exception as schema_err:
self.log.error(
Expand Down Expand Up @@ -422,3 +438,11 @@ def migrate(self, *args, **kwargs) -> List[str]:
The method will return a list of migrated instances or an empty array.
"""
return list()

def get_validator(self, schema: dict) -> Any:
"""Returns an instance of a validator used to validate instances of the given schema.
The default implementation uses the `Draft7Validator`. SchemasProviders requiring the use
of a different validator must override this method.
"""
return Draft7Validator(schema, format_checker=draft7_format_checker)
31 changes: 31 additions & 0 deletions elyra/tests/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,37 @@ def test_manager_hierarchy_remove(tests_hierarchy_manager, factory_location, sha
assert byo_1.resource.startswith(str(factory_location))


@pytest.mark.skipif(
os.getenv("TEST_VALIDATION_PERFORMANCE", "0") != "1",
reason="test_validation_performance - enable via env TEST_VALIDATION_PERFORMANCE=1",
)
def test_validation_performance():
import psutil

metadata_mgr = MetadataManager(schemaspace=METADATA_TEST_SCHEMASPACE)
metadata_dict = {**valid_metadata_json}
metadata = Metadata.from_dict(METADATA_TEST_SCHEMASPACE_ID, metadata_dict)

process = psutil.Process(os.getpid())
# warm up
metadata_mgr.validate("perf_test", metadata)

iterations = 10000
memory_start = process.memory_info()
t0 = time.time()
for _ in range(0, iterations):
metadata_mgr.validate("perf_test", metadata)

t1 = time.time()
memory_end = process.memory_info()
diff = (memory_end.rss - memory_start.rss) / 1024
print(
f"Memory: {diff:,} kb, Start: {memory_start.rss / 1024 / 1024:,.3f} mb, "
f"End: {memory_end.rss / 1024 / 1024:,.3f} mb., "
f"Elapsed time: {t1-t0:.3f}s over {iterations} iterations."
)


# ########################## MetadataStore Tests ###########################
def test_store_schemaspace(store_manager, schemaspace_location):
# Delete the metadata dir contents and attempt listing metadata
Expand Down
2 changes: 1 addition & 1 deletion elyra/tests/metadata/test_metadata_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ def test_remove_no_name(script_runner):
assert "ERROR: '--name' is a required parameter." in ret.stdout


def test_remove_with_no_equals(script_runner):
def test_remove_with_no_equals(script_runner, mock_data_dir):
# Attempt removal w/o the '=' between parameter and value
metadata_manager = MetadataManager(schemaspace=METADATA_TEST_SCHEMASPACE)
valid = Metadata(**valid_metadata_json)
Expand Down

0 comments on commit 35ff340

Please sign in to comment.