From 0ed60d63617a8ae25d29ebf39b0f8485e4c7ab5a Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Mon, 25 Mar 2024 10:02:55 -0400 Subject: [PATCH] Jpuerto/nihdev 454 ivt adjust dir schema validation for shared uploads (#1290) * General: Add pseudo-code * General: Update CHANGELOG * General: Initial approach for shared directory support * General: non-global != non_global * General: Comments for changes * General: Add example for shared uploads * General: Resolve changelog conflict with main * General: Ignore multi_ref if it is a shared upload * General: Change how plugin_validation functions for shared uploads * General: Fix typing issue * General: Remove TODO - handled deeper than where it was placed. * General: Update upload.py to check for non_global_file existence * General: Fix upload.py * General: Update docs/tests * General: Change error messaging to be more clear about where the error is. * General: Change error messaging to be more clear about where the error is. * General: Update syntax to pass formatting checks * Plugins: Update paths passed to be global and non_global * Upload: Change shared_directories to just be a bool, we don't use the actual stored values anywhere. * Directory Validation: Fix error message for paths * General: Line length change * Directory Validation: Fix import * General: Formatting changes * General: Remove unused functions * General: Reformatting * General: Formatting changes * Tests: Update tests --------- Co-authored-by: Juan Puerto <=> --- CHANGELOG.md | 3 + .../bad-cedar-dir-histology/README.md | 4 +- .../MOCK_RESPONSE.json | 1 + .../good-cedar-phenocycler-shared/README.md | 12 ++++ .../README_ONLINE.md | 3 + .../fixtures.json | 1 + .../upload/antibodies.tsv | 2 + .../upload/contributors.tsv | 2 + .../global/extras/microscope_hardware.json | 0 .../global/extras/microscope_settings.json | 0 .../images/OL9.ome-tiff.channels.csv | 0 .../global/lab_processed/images/OL9.ome.tiff | 0 .../upload/global/raw/images/OL9.qptiff | 0 .../upload/global/raw/images/OL9.xpd | 0 .../raw/images/phenocycler/FocusMap.tif | 0 .../phenocycler/OL9_Scan1_Cycle0.qptiff.raw | 0 .../phenocycler/OL9_Scan1_Cycle1.qptiff.raw | 0 .../phenocycler/OL9_Scan1_Cycle2.qptiff.raw | 0 .../phenocycler/OL9_Scan1_Cycle3.qptiff.raw | 0 .../phenocycler/OL9_Scan1_Cycle4.qptiff.raw | 0 .../upload/metadata.tsv | 2 + .../images/1-tissue-boundary.geojson | 0 .../images/2-tissue-boundary.geojson | 0 .../images/3-tissue-boundary.geojson | 0 .../directory_validator.py | 41 +++++++------ .../plugin_validator.py | 28 ++++++--- src/ingest_validation_tools/upload.py | 61 +++++++++++++++++-- .../validation_utils.py | 27 ++++++-- 28 files changed, 149 insertions(+), 38 deletions(-) create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/MOCK_RESPONSE.json create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/README.md create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/README_ONLINE.md create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/fixtures.json create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/antibodies.tsv create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_hardware.json create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_settings.json create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome-tiff.channels.csv create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome.tiff create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.qptiff create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.xpd create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/FocusMap.tif create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle0.qptiff.raw create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle1.qptiff.raw create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle2.qptiff.raw create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle3.qptiff.raw create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle4.qptiff.raw create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/metadata.tsv create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/1-tissue-boundary.geojson create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/2-tissue-boundary.geojson create mode 100644 examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/3-tissue-boundary.geojson diff --git a/CHANGELOG.md b/CHANGELOG.md index f9100e19c..09ed8718b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## v0.0.19 (in progress) +- Directory validation changes for "shared" uploads + ## v0.0.18 - Update PhenoCycler directory schema diff --git a/examples/dataset-examples/bad-cedar-dir-histology/README.md b/examples/dataset-examples/bad-cedar-dir-histology/README.md index 1dd13c01d..4401ceadc 100644 --- a/examples/dataset-examples/bad-cedar-dir-histology/README.md +++ b/examples/dataset-examples/bad-cedar-dir-histology/README.md @@ -18,7 +18,7 @@ Upload Errors: - lab_processed\/images\/[^\/]+\.ome\.tiff. - raw\/.*. - raw\/images\/.*. - - raw\/images\/[^\/]+\.(?:xml|scn|vsi|ndpi|svs|czi|tiff). + - raw\/images\/[^\/]+\.(?:xml|scn|vsi|ndpi|svs|czi|tiff|qptiff). Reference Errors: No References: Files: @@ -26,4 +26,4 @@ Reference Errors: Hint: 'If validation fails because of extra whitespace in the TSV, try: src/cleanup_whitespace.py --tsv_in original.tsv --tsv_out clean.tsv.' -``` +``` \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/MOCK_RESPONSE.json b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/MOCK_RESPONSE.json new file mode 100644 index 000000000..b901d8b99 --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/MOCK_RESPONSE.json @@ -0,0 +1 @@ +{"PhenoCycler": {"args": ["HBM894.RZFJ.749", "", "https://dx.doi.org/10.17504/protocols.io.eq2lyno9qvx9/v1", "PhenoCycler", "DNA", "Yes", "Andor", "Axio Observer 7", "1", "day", "", "", "./contributors.tsv", "./one", "./antibodies.tsv", "11", "hour", "1", "1", "1", "1", "", "NAKATPASE", "DAPI", "./lab_processed/images/1-tissue-boundary.geojson; ./lab_processed/images/2-tissue-boundary.geojson", "62af6829-743d-423e-a701-204710e56beb"], "response": {"assaytype": "phenocycler", "contains-pii": false, "dataset-type": "PhenoCycler", "description": "PhenoCycler", "dir-schema": "phenocycler-v2", "primary": true, "vitessce-hints": []}}} \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README.md b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README.md new file mode 100644 index 000000000..5c0ae8902 --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README.md @@ -0,0 +1,12 @@ +``` +No errors! +Time: 2024-03-25 09:45:27.701080 +Git version: 97b824f +Directory: examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload +TSVs: + metadata.tsv: + Schema: phenocycler-v2 + Metadata schema version: '2' + Directory schema versions: phenocycler-v2 + +``` \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README_ONLINE.md b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README_ONLINE.md new file mode 100644 index 000000000..91c0d2d94 --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/README_ONLINE.md @@ -0,0 +1,3 @@ +Reference Errors: Shared Directory References: examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/1-tissue-boundary.geojson; ./lab_processed/images/2-tissue-boundary.geojson: Does not exist in upload. +Hint: If validation fails because of extra whitespace in the TSV, try: +src/cleanup_whitespace.py --tsv_in original.tsv --tsv_out clean.tsv. diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/fixtures.json b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/fixtures.json new file mode 100644 index 000000000..ba405e888 --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/fixtures.json @@ -0,0 +1 @@ +{"assaytype": {"PhenoCycler": {"assaytype": "phenocycler", "contains-pii": false, "dataset-type": "PhenoCycler", "description": "PhenoCycler", "dir-schema": "phenocycler-v2", "primary": true, "vitessce-hints": []}}, "validation": {"phenocycler": null, "antibodies": null, "contributors": null}} \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/antibodies.tsv b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/antibodies.tsv new file mode 100644 index 000000000..63c8a5bce --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/antibodies.tsv @@ -0,0 +1,2 @@ +channel_id hgnc_symbol antibody_rrid uniprot_accession_number lot_number dilution_factor antibody_concentration_value antibody_concentration_unit conjugated_cat_number conjugated_tag metadata_schema_id +1 ID2B AB_10002075 Q9NNX6 GR3238979-1 312f7be0-9aec-4cae-b942-a8864c0aa1ce \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv new file mode 100644 index 000000000..f217dee66 --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv @@ -0,0 +1,2 @@ +first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id +Juan Puerto Juan Puerto PSC 0000-0002-8928-741X Yes No No 94dae6f8-0756-4ab0-a47b-138e446a9501 \ No newline at end of file diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_hardware.json b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_hardware.json new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_settings.json b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/extras/microscope_settings.json new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome-tiff.channels.csv b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome-tiff.channels.csv new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome.tiff b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/lab_processed/images/OL9.ome.tiff new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.qptiff b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.qptiff new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.xpd b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/OL9.xpd new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/FocusMap.tif b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/FocusMap.tif new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle0.qptiff.raw b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle0.qptiff.raw new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle1.qptiff.raw b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle1.qptiff.raw new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle2.qptiff.raw b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle2.qptiff.raw new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle3.qptiff.raw b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle3.qptiff.raw new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle4.qptiff.raw b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/global/raw/images/phenocycler/OL9_Scan1_Cycle4.qptiff.raw new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/metadata.tsv b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/metadata.tsv new file mode 100644 index 000000000..4ad7ade1e --- /dev/null +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/metadata.tsv @@ -0,0 +1,2 @@ +parent_sample_id lab_id preparation_protocol_doi dataset_type analyte_class is_targeted acquisition_instrument_vendor acquisition_instrument_model source_storage_duration_value source_storage_duration_unit time_since_acquisition_instrument_calibration_value time_since_acquisition_instrument_calibration_unit contributors_path data_path antibodies_path total_run_time_value total_run_time_unit number_of_antibodies number_of_channels number_of_biomarker_imaging_rounds number_of_total_imaging_rounds slide_id cell_boundary_marker_or_stain nuclear_marker_or_stain non_global_files metadata_schema_id +HBM894.RZFJ.749 https://dx.doi.org/10.17504/protocols.io.eq2lyno9qvx9/v1 PhenoCycler DNA Yes Andor Axio Observer 7 1 day ./contributors.tsv ./one ./antibodies.tsv 11 hour 1 1 1 1 NAKATPASE DAPI ./lab_processed/images/1-tissue-boundary.geojson; ./lab_processed/images/2-tissue-boundary.geojson 62af6829-743d-423e-a701-204710e56beb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/1-tissue-boundary.geojson b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/1-tissue-boundary.geojson new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/2-tissue-boundary.geojson b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/2-tissue-boundary.geojson new file mode 100644 index 000000000..e69de29bb diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/3-tissue-boundary.geojson b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/non_global/lab_processed/images/3-tissue-boundary.geojson new file mode 100644 index 000000000..e69de29bb diff --git a/src/ingest_validation_tools/directory_validator.py b/src/ingest_validation_tools/directory_validator.py index 3e21606c3..78da3c0af 100644 --- a/src/ingest_validation_tools/directory_validator.py +++ b/src/ingest_validation_tools/directory_validator.py @@ -13,7 +13,7 @@ def __init__(self, errors): def validate_directory( - path: Path, schema_files: List[Dict], dataset_ignore_globs: List[str] = [] + paths: list[Path], schema_files: List[Dict], dataset_ignore_globs: List[str] = [] ) -> None: """ Given a directory path, and a directory schema, @@ -23,25 +23,32 @@ def validate_directory( required_patterns, allowed_patterns = _get_required_allowed(schema_files) dependencies = _get_dependencies(schema_files) except KeyError as e: - raise DirectoryValidationErrors(f"Error finding patterns for {path}: {e}") + raise DirectoryValidationErrors( + f"Error finding patterns" f" for {','.join([x.as_posix() for x in paths])}: {e}" + ) required_missing_errors: List[str] = [] not_allowed_errors: List[str] = [] - if not path.exists(): - raise FileNotFoundError(0, "No such file or directory", str(path)) actual_paths = [] - for triple in os.walk(path): - (dir_path, _, file_names) = triple - # [1:] removes leading '/', if any. - prefix = dir_path.replace(str(path), "")[1:] - if os.name == "nt": - # Convert MS backslashes to forward slashes. - prefix = prefix.replace("\\", "/") - # If this is not the root of the path and is a leaf directory - if not file_names and prefix: - actual_paths += [f"{prefix}/"] - # Otherwise this should be a branch directory - else: - actual_paths += [f"{prefix}/{name}" for name in file_names] if prefix else file_names + + for path in paths: + if not path.exists(): + raise FileNotFoundError(0, "No such file or directory", str(path)) + + for triple in os.walk(path): + (dir_path, _, file_names) = triple + # [1:] removes leading '/', if any. + prefix = dir_path.replace(str(path), "")[1:] + if os.name == "nt": + # Convert MS backslashes to forward slashes. + prefix = prefix.replace("\\", "/") + # If this is not the root of the path and is a leaf directory + if not file_names and prefix: + actual_paths += [f"{prefix}/"] + # Otherwise this should be a branch directory + else: + actual_paths += ( + [f"{prefix}/{name}" for name in file_names] if prefix else file_names + ) """TODO: message_munger adds periods at the end of these messages which is very confusing for regex! Also human readability of required_patterns diff --git a/src/ingest_validation_tools/plugin_validator.py b/src/ingest_validation_tools/plugin_validator.py index 4ac3665d1..f0422db52 100644 --- a/src/ingest_validation_tools/plugin_validator.py +++ b/src/ingest_validation_tools/plugin_validator.py @@ -91,6 +91,7 @@ def run_plugin_validators_iter( metadata_path: PathOrStr, sv: SchemaVersion, plugin_dir: PathOrStr, + is_shared_upload: bool, verbose: bool = True, **kwargs, ) -> Iterator[KeyValuePair]: @@ -111,20 +112,27 @@ def run_plugin_validators_iter( raise ValidatorError(f"{metadata_path} contains more than one assay type") data_paths = [] - if all("data_path" in row for row in sv.rows): - for row in sv.rows: - data_path = Path(row["data_path"]) - if not data_path.is_absolute(): - data_path = Path(metadata_path).parent / data_path - if not data_path.is_dir(): - raise ValidatorError(f"{data_path} should be the base directory of a dataset") - data_paths.append(data_path) + if is_shared_upload: + paths = [Path(metadata_path).parent / "global", Path(metadata_path).parent / "non_global"] for k, v in validation_error_iter( - data_paths, sv.dataset_type, plugin_dir, sv.contains, verbose=verbose, **kwargs + paths, sv.dataset_type, plugin_dir, sv.contains, **kwargs ): yield k, v else: - raise ValidatorError(f"{metadata_path} is missing values in 'data_path' column") + if all("data_path" in row for row in sv.rows): + for row in sv.rows: + data_path = Path(row["data_path"]) + if not data_path.is_absolute(): + data_path = Path(metadata_path).parent / data_path + if not data_path.is_dir(): + raise ValidatorError(f"{data_path} should be the base directory of a dataset") + data_paths.append(data_path) + for k, v in validation_error_iter( + data_paths, sv.dataset_type, plugin_dir, sv.contains, verbose=verbose, **kwargs + ): + yield k, v + else: + raise ValidatorError(f"{metadata_path} is missing values in 'data_path' column") def validation_class_iter(plugin_dir: PathOrStr) -> Iterator[Type[Validator]]: diff --git a/src/ingest_validation_tools/upload.py b/src/ingest_validation_tools/upload.py index 3602b57de..c662de19e 100644 --- a/src/ingest_validation_tools/upload.py +++ b/src/ingest_validation_tools/upload.py @@ -99,6 +99,12 @@ def __init__( if not self.is_multi_assay: self._check_single_assay() + self.is_shared_upload = {"global", "non_global"} == { + x + for x in self.directory_path.glob("*global") + if x.is_dir() and x.name in ["global", "non_global"] + } + except PreflightError as e: self.errors["Preflight"] = e @@ -343,10 +349,13 @@ def _get_reference_errors(self) -> dict: errors: Dict[str, Any] = {} no_ref_errors = self.__get_no_ref_errors() multi_ref_errors = self.__get_multi_ref_errors() + shared_dir_errors = self.__get_shared_dir_errors() if no_ref_errors: errors["No References"] = no_ref_errors if multi_ref_errors: errors["Multiple References"] = multi_ref_errors + if shared_dir_errors: + errors["Shared Directory References"] = shared_dir_errors return errors def _get_plugin_errors(self, **kwargs) -> dict: @@ -362,7 +371,12 @@ def _get_plugin_errors(self, **kwargs) -> dict: # non-parent dataset_types if not self.multi_parent or (sv.dataset_type == self.multi_parent.dataset_type): for k, v in run_plugin_validators_iter( - metadata_path, sv, plugin_path, verbose=self.verbose, **kwargs + metadata_path, + sv, + plugin_path, + self.is_shared_upload, + verbose=self.verbose, + **kwargs, ): errors[k].append(v) except PluginValidatorError as e: @@ -654,17 +668,22 @@ def _get_ref_errors( def _check_data_path( self, schema_version: SchemaVersion, metadata_path: Path, path_value: str ): + data_path = Path(path_value) errors = {} - data_path = self.directory_path / path_value + if not schema_version.dir_schema: raise Exception( - f"No directory schema found for data_path {data_path} in {metadata_path}!" + f"No directory schema found for data_path " + f"{self.directory_path / data_path} in {metadata_path}!" ) + ref_errors = get_data_dir_errors( schema_version.dir_schema, - data_path, + root_path=self.directory_path, + data_dir_path=data_path, dataset_ignore_globs=self.dataset_ignore_globs, ) + if ref_errors: errors[f"{str(metadata_path)}, column 'data_path', value '{path_value}'"] = ref_errors return errors @@ -719,6 +738,35 @@ def __get_no_ref_errors(self) -> dict: errors["Files"] = unreferenced_file_paths return errors + def __get_shared_dir_errors(self) -> dict: + errors = {} + all_non_global_files = self.__get_non_global_files_references() + if all_non_global_files: + for row_non_global_files, row_references in all_non_global_files.items(): + row_non_global_files = { + (self.directory_path / "./non_global" / Path(x.strip())): Path(x.strip()) + for x in row_non_global_files.split(";") + if x.strip() + } + + for ( + full_path_row_non_global_file, + rel_path_row_non_global_file, + ) in row_non_global_files.items(): + if not full_path_row_non_global_file.exists(): + errors[",".join(row_references)] = ( + f"{rel_path_row_non_global_file} not exist in upload." + ) + else: + # Catch case 2 + if self.is_shared_upload: + errors["Upload Errors"] = ( + "No non_global_files specified but " + "upload has global & non_global directories" + ) + + return errors + def __get_multi_ref_errors(self) -> dict: # If - multi-assay dataset (and only that dataset is referenced) don't fail # Else - fail @@ -726,7 +774,7 @@ def __get_multi_ref_errors(self) -> dict: data_references = self.__get_data_references() for path, references in data_references.items(): if path not in self.multi_assay_data_paths: - if len(references) > 1: + if len(references) > 1 and not self.is_shared_upload: errors[path] = references return errors @@ -739,6 +787,9 @@ def __get_antibodies_references(self) -> dict: def __get_contributors_references(self) -> dict: return self.__get_references("contributors_path") + def __get_non_global_files_references(self) -> dict: + return self.__get_references("non_global_files") + def __get_references(self, col_name) -> dict: references = defaultdict(list) for tsv_path, schema in self.effective_tsv_paths.items(): diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index 346a25698..5448ec6a7 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -155,12 +155,29 @@ def read_rows(path: Path, encoding: str) -> List: def get_data_dir_errors( dir_schema: str, - data_path: Path, + root_path: Path, + data_dir_path: Path, dataset_ignore_globs: List[str] = [], ) -> Optional[dict]: """ Validate a single data_path. """ + expected_shared_directories = {"global", "non_global"} + # Create the most common data_path + data_paths = [root_path / data_dir_path] + + # Check to see whether the shared upload directories exist + shared_directories = { + x for x in root_path.glob("*") if x.is_dir() and x.name in expected_shared_directories + } + + # Iterate over the set of paths and ensure that the names of those paths + # matches the set of expected shared directories above + if {x.name for x in shared_directories} == expected_shared_directories: + # If they exist create a list of the paths + data_paths = list(shared_directories) + # Otherwise, do nothing we can just use the predefine data_path + schema = get_directory_schema(dir_schema=dir_schema) if schema is None: @@ -173,20 +190,22 @@ def get_data_dir_errors( else None ) + printable_data_paths = [x.as_posix() for x in data_paths] + try: - validate_directory(data_path, schema["files"], dataset_ignore_globs=dataset_ignore_globs) + validate_directory(data_paths, schema["files"], dataset_ignore_globs=dataset_ignore_globs) except DirectoryValidationErrors as e: # If there are DirectoryValidationErrors and the schema is deprecated/draft... # schema deprecation/draft status is more important. if schema_warning: return schema_warning errors = {} - errors[f"{data_path} (as {dir_schema})"] = e.errors + errors[f"{','.join(printable_data_paths)} (as {dir_schema})"] = e.errors return errors except OSError as e: # If there are OSErrors and the schema is deprecated/draft... # the OSErrors are more important. - return {f"{data_path} (as {dir_schema})": {e.strerror: e.filename}} + return {f"{','.join(printable_data_paths)} (as {dir_schema})": {e.strerror: e.filename}} if schema_warning: return schema_warning