diff --git a/CHANGELOG.md b/CHANGELOG.md
index 841d1f890..db4a241c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
 # Changelog
 
-## v0.0.25 (in progress)
+## v0.0.25
 - Update GeoMx NGS directory schema
+- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP
 
 ## v0.0.24
 - Release MERFISH
diff --git a/examples/dataset-examples/bad-no-assay-type/README.md b/examples/dataset-examples/bad-no-assay-type/README.md
index 00f25a7d7..05410254e 100644
--- a/examples/dataset-examples/bad-no-assay-type/README.md
+++ b/examples/dataset-examples/bad-no-assay-type/README.md
@@ -1,4 +1,5 @@
 ```
 Preflight Errors:
-- No assay_type or dataset_type in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
-```
\ No newline at end of file
+- 'Required dataset field not present in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
+  One of the following is required: assay_type, dataset_type, derived_dataset_type.'
+```
diff --git a/src/ingest_validation_tools/enums.py b/src/ingest_validation_tools/enums.py
index 178aeea60..d3909b488 100644
--- a/src/ingest_validation_tools/enums.py
+++ b/src/ingest_validation_tools/enums.py
@@ -208,3 +208,18 @@ class Sample(EntityTypes):
     @classmethod
     def with_parent_type(cls):
         return [*[entity_type for entity_type in cls], OtherTypes.SAMPLE]
+
+
+# These should all be considered to be mutually exclusive,
+# even within the same type
+UNIQUE_FIELDS_MAP = {
+    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
+    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
+    DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"},
+    OtherTypes.SOURCE: {"strain_rrid"},
+    OtherTypes.ORGAN: {"organ_id"},  # Deprecated
+    OtherTypes.SAMPLE: {"sample_id"},
+}
+OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
+    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
+}
diff --git a/src/ingest_validation_tools/schema_loader.py b/src/ingest_validation_tools/schema_loader.py
index 8018826d6..fb98d53f8 100644
--- a/src/ingest_validation_tools/schema_loader.py
+++ b/src/ingest_validation_tools/schema_loader.py
@@ -8,6 +8,7 @@
 from typing import Dict, List, Optional, Sequence, Set, Union
 
 from ingest_validation_tools.enums import (
+    UNIQUE_FIELDS_MAP,
     DatasetType,
     EntityTypes,
     OtherTypes,
@@ -91,13 +92,8 @@ def get_row_data(self):
             self.is_cedar = True
         else:
             self.is_cedar = False
+        self.get_dataset_type_value()
         self.version = self.rows[0].get("version")
-        assay_type = self.rows[0].get("assay_type")
-        dataset_type = self.rows[0].get("dataset_type")
-        if assay_type is not None and dataset_type is not None:
-            raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!")
-        else:
-            self.dataset_type = assay_type if assay_type else dataset_type
 
     def get_assayclassifier_data(self):
         self.vitessce_hints = self.soft_assay_data.get("vitessce-hints", [])
@@ -109,6 +105,19 @@ def get_assayclassifier_data(self):
         contains = self.soft_assay_data.get("must-contain", [])
         self.contains = [schema.lower() for schema in contains]
 
+    def get_dataset_type_value(self):
+        dataset_fields = {
+            k: v for k, v in self.rows[0].items() if k in UNIQUE_FIELDS_MAP[DatasetType.DATASET]
+        }
+        values_found = list(dataset_fields.values())
+        if len(values_found) == 0:
+            return
+        elif len(values_found) > 1:
+            raise PreflightError(
+                f"Found multiple dataset fields for path {self.path}: {dataset_fields}"
+            )
+        self.dataset_type = values_found[0]
+
 
 @dataclass
 class EntityTypeInfo:
diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py
index b50f29556..d85095d21 100644
--- a/src/ingest_validation_tools/validation_utils.py
+++ b/src/ingest_validation_tools/validation_utils.py
@@ -12,7 +12,14 @@
     DirectoryValidationErrors,
     validate_directory,
 )
-from ingest_validation_tools.enums import DatasetType, EntityTypes, OtherTypes, Sample
+from ingest_validation_tools.enums import (
+    OTHER_FIELDS_UNIQUE_FIELDS_MAP,
+    UNIQUE_FIELDS_MAP,
+    DatasetType,
+    EntityTypes,
+    OtherTypes,
+    Sample,
+)
 from ingest_validation_tools.schema_loader import (
     EntityTypeInfo,
     PreflightError,
@@ -21,18 +28,6 @@
 )
 from ingest_validation_tools.table_validator import ReportType
 
-UNIQUE_FIELDS_MAP = {
-    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
-    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
-    DatasetType.DATASET: {"assay_type", "dataset_type"},
-    OtherTypes.SOURCE: {"strain_rrid"},
-    OtherTypes.ORGAN: {"organ_id"},  # Deprecated?
-    OtherTypes.SAMPLE: {"sample_id"},
-}
-OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
-    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
-}
-
 
 def match_field_in_unique_fields(
     match_fields: list, path: str, dataset=True
@@ -86,7 +81,9 @@ def get_schema_version(
         return other_type
     message = []
     if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]:
-        message.append(f"No assay_type or dataset_type in {path}.")
+        message.append(
+            f"Required dataset field not present in {path}. One of the following is required: {', '.join(sorted(UNIQUE_FIELDS_MAP[DatasetType.DATASET]))}"
+        )
     if "channel_id" in rows[0]:
         message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.')
     elif "orcid_id" in rows[0]: