From 66cd4fe11af0475121475b98dd50213f9755e4f3 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 30 Aug 2024 13:56:05 -0400 Subject: [PATCH 1/7] adding epic dataset unique field --- src/ingest_validation_tools/validation_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index b50f29556..8365c9aa4 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -21,12 +21,14 @@ ) from ingest_validation_tools.table_validator import ReportType +# These should all be considered to be mutually exclusive, +# even within the same type UNIQUE_FIELDS_MAP = { OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"}, OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"}, - DatasetType.DATASET: {"assay_type", "dataset_type"}, + DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"}, OtherTypes.SOURCE: {"strain_rrid"}, - OtherTypes.ORGAN: {"organ_id"}, # Deprecated? + OtherTypes.ORGAN: {"organ_id"}, # Deprecated OtherTypes.SAMPLE: {"sample_id"}, } OTHER_FIELDS_UNIQUE_FIELDS_MAP = { From de323c1dc66d77f3964af377d1b13c56283535af Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 30 Aug 2024 13:57:08 -0400 Subject: [PATCH 2/7] forgot changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea7d39bc2..776676e41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - Update Visium with probes directory schema - Update Visium no probes directory schema - Change to EntityTypeInfo constraint format to support constraints endpoint +- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP ## v0.0.23 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib From d145ebc54f57e6f6f981580c485ee950c5a62030 Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:59:39 -0400 Subject: [PATCH 3/7] Update CHANGELOG.md --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 776676e41..7b224cfd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## v0.0.25 +- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP + ## v0.0.24 - Release MERFISH - Add MERFISH directory schema @@ -18,7 +21,6 @@ - Update Visium with probes directory schema - Update Visium no probes directory schema - Change to EntityTypeInfo constraint format to support constraints endpoint -- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP ## v0.0.23 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib From ee2543b7ef2b98ad9d7763dd8ff544503d0fac5d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 6 Sep 2024 13:52:41 -0400 Subject: [PATCH 4/7] fixing error message --- src/ingest_validation_tools/validation_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index 8365c9aa4..2d062ea03 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -88,7 +88,9 @@ def get_schema_version( return other_type message = [] if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]: - message.append(f"No assay_type or dataset_type in {path}.") + message.append( + f"Required dataset field not present in {path}. One of the following is required: {', '.join(UNIQUE_FIELDS_MAP[DatasetType.DATASET])}" + ) if "channel_id" in rows[0]: message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.') elif "orcid_id" in rows[0]: From e5382decf1e170e2df122b154cd5ea1f01d7e4c4 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 6 Sep 2024 13:58:19 -0400 Subject: [PATCH 5/7] fixing tests --- examples/dataset-examples/bad-no-assay-type/README.md | 5 +++-- src/ingest_validation_tools/validation_utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/dataset-examples/bad-no-assay-type/README.md b/examples/dataset-examples/bad-no-assay-type/README.md index 00f25a7d7..05410254e 100644 --- a/examples/dataset-examples/bad-no-assay-type/README.md +++ b/examples/dataset-examples/bad-no-assay-type/README.md @@ -1,4 +1,5 @@ ``` Preflight Errors: -- No assay_type or dataset_type in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv. -``` \ No newline at end of file +- 'Required dataset field not present in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv. + One of the following is required: assay_type, dataset_type, derived_dataset_type.' +``` diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index 2d062ea03..99d13fa0a 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -89,7 +89,7 @@ def get_schema_version( message = [] if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]: message.append( - f"Required dataset field not present in {path}. One of the following is required: {', '.join(UNIQUE_FIELDS_MAP[DatasetType.DATASET])}" + f"Required dataset field not present in {path}. One of the following is required: {', '.join(sorted(UNIQUE_FIELDS_MAP[DatasetType.DATASET]))}" ) if "channel_id" in rows[0]: message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.') From f475ace85a150f060aa199d4bbb9d02ae0c318c4 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 6 Sep 2024 16:52:01 -0400 Subject: [PATCH 6/7] setting dataset_type for EPIC datasets --- src/ingest_validation_tools/enums.py | 15 ++++++++++++ src/ingest_validation_tools/schema_loader.py | 21 ++++++++++++----- .../validation_utils.py | 23 +++++++------------ 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/ingest_validation_tools/enums.py b/src/ingest_validation_tools/enums.py index 178aeea60..0082f97a0 100644 --- a/src/ingest_validation_tools/enums.py +++ b/src/ingest_validation_tools/enums.py @@ -208,3 +208,18 @@ class Sample(EntityTypes): @classmethod def with_parent_type(cls): return [*[entity_type for entity_type in cls], OtherTypes.SAMPLE] + + +# These should all be considered to be mutually exclusive, +# even within the same type +UNIQUE_FIELDS_MAP = { + OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"}, + OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"}, + DatasetType.DATASET: {"assay_type", "dataset_type"}, + OtherTypes.SOURCE: {"strain_rrid"}, + OtherTypes.ORGAN: {"organ_id"}, # Deprecated? + OtherTypes.SAMPLE: {"sample_id"}, +} +OTHER_FIELDS_UNIQUE_FIELDS_MAP = { + k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET +} diff --git a/src/ingest_validation_tools/schema_loader.py b/src/ingest_validation_tools/schema_loader.py index 8018826d6..fb98d53f8 100644 --- a/src/ingest_validation_tools/schema_loader.py +++ b/src/ingest_validation_tools/schema_loader.py @@ -8,6 +8,7 @@ from typing import Dict, List, Optional, Sequence, Set, Union from ingest_validation_tools.enums import ( + UNIQUE_FIELDS_MAP, DatasetType, EntityTypes, OtherTypes, @@ -91,13 +92,8 @@ def get_row_data(self): self.is_cedar = True else: self.is_cedar = False + self.get_dataset_type_value() self.version = self.rows[0].get("version") - assay_type = self.rows[0].get("assay_type") - dataset_type = self.rows[0].get("dataset_type") - if assay_type is not None and dataset_type is not None: - raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!") - else: - self.dataset_type = assay_type if assay_type else dataset_type def get_assayclassifier_data(self): self.vitessce_hints = self.soft_assay_data.get("vitessce-hints", []) @@ -109,6 +105,19 @@ def get_assayclassifier_data(self): contains = self.soft_assay_data.get("must-contain", []) self.contains = [schema.lower() for schema in contains] + def get_dataset_type_value(self): + dataset_fields = { + k: v for k, v in self.rows[0].items() if k in UNIQUE_FIELDS_MAP[DatasetType.DATASET] + } + values_found = list(dataset_fields.values()) + if len(values_found) == 0: + return + elif len(values_found) > 1: + raise PreflightError( + f"Found multiple dataset fields for path {self.path}: {dataset_fields}" + ) + self.dataset_type = values_found[0] + @dataclass class EntityTypeInfo: diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index 99d13fa0a..d85095d21 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -12,7 +12,14 @@ DirectoryValidationErrors, validate_directory, ) -from ingest_validation_tools.enums import DatasetType, EntityTypes, OtherTypes, Sample +from ingest_validation_tools.enums import ( + OTHER_FIELDS_UNIQUE_FIELDS_MAP, + UNIQUE_FIELDS_MAP, + DatasetType, + EntityTypes, + OtherTypes, + Sample, +) from ingest_validation_tools.schema_loader import ( EntityTypeInfo, PreflightError, @@ -21,20 +28,6 @@ ) from ingest_validation_tools.table_validator import ReportType -# These should all be considered to be mutually exclusive, -# even within the same type -UNIQUE_FIELDS_MAP = { - OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"}, - OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"}, - DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"}, - OtherTypes.SOURCE: {"strain_rrid"}, - OtherTypes.ORGAN: {"organ_id"}, # Deprecated - OtherTypes.SAMPLE: {"sample_id"}, -} -OTHER_FIELDS_UNIQUE_FIELDS_MAP = { - k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET -} - def match_field_in_unique_fields( match_fields: list, path: str, dataset=True From dbc2c417ca9f98b470c83e9d1b3820b6d491165c Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 6 Sep 2024 16:54:17 -0400 Subject: [PATCH 7/7] fixing messed up merge of enums --- src/ingest_validation_tools/enums.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ingest_validation_tools/enums.py b/src/ingest_validation_tools/enums.py index 0082f97a0..d3909b488 100644 --- a/src/ingest_validation_tools/enums.py +++ b/src/ingest_validation_tools/enums.py @@ -215,9 +215,9 @@ def with_parent_type(cls): UNIQUE_FIELDS_MAP = { OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"}, OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"}, - DatasetType.DATASET: {"assay_type", "dataset_type"}, + DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"}, OtherTypes.SOURCE: {"strain_rrid"}, - OtherTypes.ORGAN: {"organ_id"}, # Deprecated? + OtherTypes.ORGAN: {"organ_id"}, # Deprecated OtherTypes.SAMPLE: {"sample_id"}, } OTHER_FIELDS_UNIQUE_FIELDS_MAP = {