Skip to content

Commit

Permalink
Entity type constraints (#1330) (#1333)
Browse files Browse the repository at this point in the history
* work in progress on entity constraints querying

* basic logic sketched in

* fixing linting error

* very unfinished, need to review entity constraint endpoint documentation further

* architected, testing, definitely broken

* at least one existing test passing, need to run all + design new tests

* added testing

* still testing, lots of edge cases

* messy code, will tidy up

* removed unnecessary code

* fixing formatting errors

* putting enums where they belong

* forgot a file

* need to fix my linter

* made sample constraints work I hope

* working on tests

* fixed broken test

* removed testing from dataset file, hopefully fixed requirements-dev install

* testing fixing action

* second testing fixing actions

* changes to fixtures from assaytype endpoint; fixing globus_token mistake

* fixing malformed constraints endpoint query

* removing breakpoint

* fixing sample checking logic error

* removing breakpoint again

* fixing adding dataset sub_type to SchemaVersion.entity_type_info

* fixing the same query URL mistake in test file

* updating test output with line number changes

Co-authored-by: gesinaphillips <[email protected]>
  • Loading branch information
jpuerto-psc and gesinaphillips authored May 20, 2024
1 parent b42126e commit de58a30
Show file tree
Hide file tree
Showing 26 changed files with 890 additions and 194 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ jobs:
architecture: 'x64'

- run: sudo apt-get install parallel
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install requests parameterized
- run: pip install -r requirements.txt
- run: pip install -r requirements-dev.txt

Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Update MUSIC directory schema
- Add semantic version to plugin test base class
- Fix row number mismatch between validation and spreadsheet validator response
- Adding entity constraints check

## v0.0.18

Expand Down
2 changes: 1 addition & 1 deletion examples/dataset-examples/bad-mixed/fixtures.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"CODEX": {"assaytype": "CODEX", "contains-pii": false, "dataset-type": "CODEX", "description": "CODEX", "dir-schema": "codex-v1", "primary": true, "tbl-schema": "codex-v0", "vitessce-hints": []}, "SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {}}
{"assaytype": {"CODEX": {"assaytype": "CODEX", "contains-pii": false, "dataset-type": "CODEX", "description": "CODEX", "dir-schema": "codex-v1", "primary": true, "tbl-schema": "codex-v0", "vitessce-hints": []}, "SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {}}
2 changes: 1 addition & 1 deletion examples/dataset-examples/bad-scatacseq-data/fixtures.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
2 changes: 1 addition & 1 deletion examples/dataset-examples/bad-scrnaseq-v0/fixtures.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"scRNAseq-10xGenomics": {"assaytype": "scRNAseq-10xGenomics-v3", "contains-pii": true, "dataset-type": "RNAseq", "description": "scRNA-seq (10x Genomics v3)", "dir-schema": "scrnaseq-v0", "primary": true, "tbl-schema": "scrnaseq-v0", "vitessce-hints": []}}, "validation": {"scRNAseq-10xGenomics-v3": {}, "contributors": {}}}
{"assaytype": {"scRNAseq-10xGenomics": {"assaytype": "scRNAseq-10xGenomics-v3", "contains-pii": true, "dataset-type": "RNAseq", "description": "RNAseq (10x Genomics v3)", "dir-schema": "scrnaseq-v0", "primary": true, "tbl-schema": "scrnaseq-v0", "vitessce-hints": []}}, "validation": {"scRNAseq-10xGenomics-v3": {}, "contributors": {}}}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v1", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v1", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
2 changes: 1 addition & 1 deletion examples/dataset-iec-examples/bad-example/fixtures.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
2 changes: 1 addition & 1 deletion examples/dataset-iec-examples/good-example/fixtures.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNARE-seq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
{"assaytype": {"SNARE-seq2": {"assaytype": "SNARE-ATACseq2", "contains-pii": true, "dataset-type": "ATACseq", "description": "snATACseq (SNAREseq2)", "dir-schema": "scatacseq-v0", "primary": true, "tbl-schema": "scatacseq-v0", "vitessce-hints": []}}, "validation": {"SNARE-ATACseq2": {}, "contributors": {}}}
1 change: 1 addition & 0 deletions requirements-dev.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ flake8==6.1.0
isort==5.13.2
mypy==0.790
pandas==2.0.1
parameterized==0.9.0
pip-tools==7.3.0
pytest==5.4.1
4 changes: 4 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ pandas==2.0.1 \
--hash=sha256:f25e23a03f7ad7211ffa30cb181c3e5f6d96a8e4cb22898af462a7333f8a74eb \
--hash=sha256:fe7914d8ddb2d54b900cec264c090b88d141a1eed605c9539a187dbc2547f022
# via -r requirements-dev.in
parameterized==0.9.0 \
--hash=sha256:4e0758e3d41bea3bbd05ec14fc2c24736723f243b28d702081aef438c9372b1b \
--hash=sha256:7fc905272cefa4f364c1a3429cbbe9c0f98b793988efb5bf90aac80f08db09b1
# via -r requirements-dev.in
pathspec==0.12.1 \
--hash=sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 \
--hash=sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712
Expand Down
10 changes: 5 additions & 5 deletions script-docs/README-validate_tsv.py.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
```text
usage: validate_tsv.py [-h] --path PATH --schema
{sample,sample-block,sample-suspension,sample-section,antibodies,contributors,metadata,murine-source}
{sample,sample-block,sample-suspension,sample-section,antibodies,contributors,metadata,source}
[--globus_token GLOBUS_TOKEN]
[--output {as_md,as_text,as_text_list,as_yaml}]
[--output {as_text,as_md}]
Validate a HuBMAP TSV. REMINDER: Besides running validate_tsv.py, you should also run validate_upload.py before submission.
Validate a HuBMAP TSV. REMINDER: Use of validate_tsv.py is deprecated; use the HuBMAP Metadata Spreadsheet Validator to validate single TSVs instead (https://metadatavalidator.metadatacenter.org).
optional arguments:
-h, --help show this help message and exit
--path PATH TSV path
--schema {sample,sample-block,sample-suspension,sample-section,antibodies,contributors,metadata,murine-source}
--schema {sample,sample-block,sample-suspension,sample-section,antibodies,contributors,metadata,source}
--globus_token GLOBUS_TOKEN
Token for URL checking using Entity API.
--output {as_md,as_text,as_text_list,as_yaml}
--output {as_text,as_md}
Exit status codes:
0: Validation passed
Expand Down
4 changes: 2 additions & 2 deletions src/generate_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
enum_maps_to_lists,
get_directory_schema,
get_fields_wo_headers,
get_is_assay,
get_pipeline_infos,
get_table_schema,
)
from ingest_validation_tools.validation_utils import OtherTypes


def main():
Expand All @@ -38,7 +38,7 @@ def main():
table_schema_versions = dict_table_schema_versions()[args.type]
assert table_schema_versions, f"No versions for {args.type}"

is_assay = get_is_assay(args.type)
is_assay = args.type not in OtherTypes.with_sample_subtypes()
table_schemas = {
v.version: get_table_schema(v, keep_headers=True) for v in table_schema_versions
}
Expand Down
56 changes: 56 additions & 0 deletions src/ingest_validation_tools/enums.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum, unique
from typing import Dict, List

"""
Expand Down Expand Up @@ -148,3 +149,58 @@
"derived_datasets",
],
}


@unique
class Sample(str, Enum):
    """Sample entity subtypes.

    Each member's *value* is the full schema name (e.g. ``sample-block``)
    while the member *name*, lowercased, is the bare subtype (e.g. ``block``).
    """

    BLOCK = "sample-block"
    SUSPENSION = "sample-suspension"
    SECTION = "sample-section"
    ORGAN = "organ"

    # TODO: StrEnum (added in Python 3.11) would let this be streamlined.
    @classmethod
    def full_names_list(cls) -> List[str]:
        """All full schema names, in declaration order."""
        return [member.value for member in cls]

    @classmethod
    def just_subtypes_list(cls) -> List[str]:
        """All bare subtype names (lowercased member names)."""
        return [member.name.lower() for member in cls]

    @classmethod
    def get_key_from_val(cls, val) -> str:
        """Return the member name for full schema name *val*, or '' if unknown."""
        return next((member.name for member in cls if member.value == val), "")


class DatasetType(str, Enum):
    """Dataset entity type; sole member's value is the schema name 'dataset'."""
    DATASET = "dataset"


@unique
class OtherTypes(str, Enum):
    """Non-assay entity types, with helpers that fold in sample subtypes."""

    ANTIBODIES = "antibodies"
    CONTRIBUTORS = "contributors"
    SOURCE = "source"
    SAMPLE = "sample"
    ORGAN = "organ"
    DONOR = "donor"

    @classmethod
    def value_list(cls):
        """All member values, in declaration order."""
        return [member.value for member in cls]

    @classmethod
    def get_sample_types(cls):
        """Bare sample subtype names (delegates to Sample)."""
        return Sample.just_subtypes_list()

    @classmethod
    def get_sample_types_full_names(cls):
        """Full sample schema names (delegates to Sample)."""
        return Sample.full_names_list()

    @classmethod
    def with_sample_subtypes(cls):
        """Member values followed by the full sample schema names."""
        return [*cls.value_list(), *cls.get_sample_types_full_names()]
14 changes: 12 additions & 2 deletions src/ingest_validation_tools/error_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class ErrorDict:
default_factory=lambda: defaultdict(list)
)
metadata_url_errors: DefaultDict[str, List] = field(default_factory=lambda: defaultdict(list))
metadata_constraint_errors: DefaultDict[str, List] = field(
default_factory=lambda: defaultdict(list)
)
reference: DefaultDict[str, Dict] = field(default_factory=lambda: defaultdict(dict))
plugin: Dict[str, List[str]] = field(default_factory=dict)
plugin_skip: Optional[str] = None
Expand All @@ -66,6 +69,7 @@ class ErrorDict:
"metadata_validation_local": "Local Validation Errors",
"metadata_validation_api": "Spreadsheet Validator Errors",
"metadata_url_errors": "URL Check Errors",
"metadata_constraint_errors": "Entity Constraint Errors",
"reference": "Reference Errors",
"plugin": "Data File Errors",
"plugin_skip": "Fatal Errors",
Expand Down Expand Up @@ -95,15 +99,21 @@ def errors_by_path(self, path: str, selected_fields: Optional[List[str]] = None)
return errors

def online_only_errors_by_path(self, path: str):
return self.errors_by_path(path, ["metadata_url_errors", "metadata_validation_api"])
return self.errors_by_path(
path, ["metadata_url_errors", "metadata_validation_api", "metadata_constraint_errors"]
)

def tsv_only_errors_by_path(self, path: str, local_allowed=True) -> List[str]:
"""
For use in front-end single TSV validation.
Turn off support for local validation by passing local_allowed=False
"""
errors = []
selected_fields = ["metadata_url_errors", "metadata_validation_api"]
selected_fields = [
"metadata_url_errors",
"metadata_validation_api",
"metadata_constraint_errors",
]
if local_allowed:
selected_fields.append("metadata_validation_local")
path_errors = self.errors_by_path(path, selected_fields)
Expand Down
24 changes: 6 additions & 18 deletions src/ingest_validation_tools/schema_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Set, Union

from ingest_validation_tools.enums import shared_enums
from ingest_validation_tools.enums import OtherTypes, shared_enums
from ingest_validation_tools.yaml_include_loader import load_yaml

_table_schemas_path = Path(__file__).parent / "table-schemas"
Expand Down Expand Up @@ -40,7 +40,6 @@ class SchemaVersion:
directory_path: Optional[Path] = None
table_schema: str = ""
path: Union[Path, str] = ""
# TODO: testing these out to see if it streamlines anything
contributors_paths: List[str] = field(default_factory=list)
antibodies_paths: List[str] = field(default_factory=list)
rows: List = field(default_factory=list)
Expand All @@ -51,6 +50,10 @@ class SchemaVersion:
dir_schema: str = ""
metadata_type: str = "assays"
contains: List = field(default_factory=list)
ancestor_entities: Dict = field(default_factory=dict)
entity_type_info: Dict = field(
default_factory=dict
) # entity_type, entity_sub_type, entity_sub_type_val; for constraint checking

def __post_init__(self):
if type(self.path) is str:
Expand All @@ -63,7 +66,7 @@ def __post_init__(self):
an invalid path: {self.path}. Error: {e}
"""
)
if get_is_assay(self.schema_name):
if self.schema_name not in OtherTypes.with_sample_subtypes():
self.metadata_type = "assays"
else:
self.metadata_type = "others"
Expand Down Expand Up @@ -228,21 +231,6 @@ def get_table_schema(
return schema


def get_is_assay(schema_name: str) -> bool:
    """Return True when *schema_name* is not one of the non-assay entity schemas."""
    # TODO: read from file system... but larger refactor may make it redundant.
    non_assay_schemas = {
        "donor",
        "organ",
        "sample",
        "antibodies",
        "contributors",
        "sample-block",
        "sample-section",
        "sample-suspension",
        "murine-source",
    }
    return schema_name not in non_assay_schemas


def get_directory_schema(
dir_schema: Optional[str] = None,
directory_type: Optional[str] = None,
Expand Down
Loading

0 comments on commit de58a30

Please sign in to comment.