diff --git a/CHANGELOG.md b/CHANGELOG.md index 8254f3e85..2ea6f15bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Remove bad paths from LC-MS directory schema - Allow multiple comma-separated parent_sample_id values - Accommodate dir schema minor versions +- Fix ORCID URL checking ## v0.0.18 diff --git a/examples/dataset-examples/bad-cedar-assay-histology/README.md b/examples/dataset-examples/bad-cedar-assay-histology/README.md index 82369c3c9..e07c12dc0 100644 --- a/examples/dataset-examples/bad-cedar-assay-histology/README.md +++ b/examples/dataset-examples/bad-cedar-assay-histology/README.md @@ -5,6 +5,9 @@ Upload Errors: column 'contributors_path', value './contributors.tsv' : CEDAR Validation Errors: examples/dataset-examples/bad-cedar-assay-histology/upload/contributors.tsv: + URL Errors: + - 'On row 2, column "orcid", value "0000-0002-8928-abcd" fails because of + error "Exception": ORCID 0000-0002-8928-abcd does not exist.' Validation Errors: - On row 0, column "orcid", value "0000-0002-8928-abcd" fails because of error "invalidValueFormat". @@ -26,4 +29,4 @@ Reference Errors: Hint: 'If validation fails because of extra whitespace in the TSV, try: src/cleanup_whitespace.py --tsv_in original.tsv --tsv_out clean.tsv.' -``` +``` \ No newline at end of file diff --git a/examples/dataset-examples/bad-cedar-assay-histology/fixtures.json b/examples/dataset-examples/bad-cedar-assay-histology/fixtures.json index 81e9098a0..2b219bdc5 100644 --- a/examples/dataset-examples/bad-cedar-assay-histology/fixtures.json +++ b/examples/dataset-examples/bad-cedar-assay-histology/fixtures.json @@ -1 +1 @@ -{"assaytype": {"Histology": {"assaytype": "h-and-e", "contains-pii": false, "dataset-type": "Histology", "description": "H&E Stained Microscopy", "dir-schema": "histology-v2", "primary": true, "vitessce-hints": []}}, "validation": {"h-and-e": {"URL Errors": ["On row 2, column \"parent_sample_id\", value \"wrong\" fails because of error \"HTTPError\": 400 Client Error: Bad Request for url: https://entity.api.hubmapconsortium.org/entities/wrong"], "Validation Errors": ["On row 0, column \"parent_sample_id\", value \"wrong\" fails because of error \"invalidValueFormat\"", "On row 1, column \"contributors_path\", value \"\" fails because of error \"missingRequired\""]}, "contributors": {"Validation Errors": ["On row 0, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"invalidValueFormat\""]}}} \ No newline at end of file +{"assaytype": {"Histology": {"assaytype": "h-and-e", "contains-pii": false, "dataset-type": "Histology", "description": "H&E Stained Microscopy", "dir-schema": "histology-v2", "primary": true, "vitessce-hints": []}}, "validation": {"h-and-e": {"URL Errors": ["On row 2, column \"parent_sample_id\", value \"wrong\" fails because of error \"HTTPError\": 400 Client Error: Bad Request for url: https://entity.api.hubmapconsortium.org/entities/wrong"], "Validation Errors": ["On row 0, column \"parent_sample_id\", value \"wrong\" fails because of error \"invalidValueFormat\"", "On row 1, column \"contributors_path\", value \"\" fails because of error \"missingRequired\""]}, "contributors": {"URL Errors": ["On row 2, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"Exception\": ORCID 0000-0002-8928-abcd does not exist."], "Validation Errors": ["On row 0, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"invalidValueFormat\""]}}} \ No newline at end of file diff --git a/examples/dataset-examples/bad-cedar-dir-histology/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-dir-histology/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-dir-histology/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-dir-histology/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-child-metadata/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-child-metadata/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-child-metadata/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-child-metadata/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-dir-structure/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-dir-structure/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-dir-structure/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-bad-dir-structure/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-missing-child/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-missing-child/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-missing-child/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-missing-child/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-unreferenced-parent-path/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-unreferenced-parent-path/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-unreferenced-parent-path/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-unreferenced-parent-path/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-with-standalone-histology/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-with-standalone-histology/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-with-standalone-histology/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-with-standalone-histology/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/bad-cedar-multi-assay-visium-wrong-child/upload/contributors.tsv b/examples/dataset-examples/bad-cedar-multi-assay-visium-wrong-child/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/bad-cedar-multi-assay-visium-wrong-child/upload/contributors.tsv +++ b/examples/dataset-examples/bad-cedar-multi-assay-visium-wrong-child/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/good-cedar-histology/upload/contributors.tsv b/examples/dataset-examples/good-cedar-histology/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/good-cedar-histology/upload/contributors.tsv +++ b/examples/dataset-examples/good-cedar-histology/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-examples/good-cedar-multi-assay-visium/upload/contributors.tsv b/examples/dataset-examples/good-cedar-multi-assay-visium/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-examples/good-cedar-multi-assay-visium/upload/contributors.tsv +++ b/examples/dataset-examples/good-cedar-multi-assay-visium/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv index f217dee66..971253dfd 100644 --- a/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv +++ b/examples/dataset-iec-examples/good-cedar-phenocycler-shared/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Juan Puerto Juan Puerto PSC 0000-0002-8928-741X Yes No No 94dae6f8-0756-4ab0-a47b-138e446a9501 \ No newline at end of file +Juan Puerto Juan Puerto PSC 0000-0002-1825-0097 Yes No No 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/dataset-iec-examples/good-example/upload/extras/contributors.tsv b/examples/dataset-iec-examples/good-example/upload/extras/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/dataset-iec-examples/good-example/upload/extras/contributors.tsv +++ b/examples/dataset-iec-examples/good-example/upload/extras/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/examples/plugin-tests/expected-failure/upload/contributors.tsv b/examples/plugin-tests/expected-failure/upload/contributors.tsv index 387e13c11..0feb19746 100644 --- a/examples/plugin-tests/expected-failure/upload/contributors.tsv +++ b/examples/plugin-tests/expected-failure/upload/contributors.tsv @@ -1,2 +1,2 @@ first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id -Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 +Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 test@test.com Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501 diff --git a/src/ingest_validation_tools/upload.py b/src/ingest_validation_tools/upload.py index 6ec291811..d3650007d 100644 --- a/src/ingest_validation_tools/upload.py +++ b/src/ingest_validation_tools/upload.py @@ -330,7 +330,7 @@ def _validate( logging.info(f"{tsv_path}: Offline validation selected, cannot reach API.") return errors else: - api_errors = self.online_checks(tsv_path, schema_version.schema_name, report_type) + api_errors = self.online_checks(tsv_path, schema_version, report_type) if api_errors: api_validated[f"{tsv_path}"] = api_errors if local_validated: @@ -342,10 +342,10 @@ def _validate( def online_checks( self, tsv_path: str, - schema_name: str, + schema: SchemaVersion, report_type: ReportType = ReportType.STR, ) -> Optional[Dict]: - url_errors = self._url_checks(tsv_path, schema_name, report_type) + url_errors = self._url_checks(tsv_path, schema, report_type) api_errors = self._api_validation(Path(tsv_path), report_type) if url_errors or api_errors: return url_errors | api_errors @@ -526,21 +526,22 @@ def _cedar_api_call(self, tsv_path: Union[str, Path]) -> requests.models.Respons return response def _url_checks( - self, tsv_path: str, schema_name: str, report_type: ReportType = ReportType.STR + self, tsv_path: str, schema: SchemaVersion, report_type: ReportType = ReportType.STR ): """ - Check provided UUIDs/HuBMAP IDs for parent_id, sample_id, organ_id. - Not using get_table_errors because CEDAR schema fields do not match - the TSV fields, which makes frictionless confused and upset. + Check provided values for parent_sample_id and orcid_id; additionally + check sample_id, organ_id, and source_id values in single TSV validation + via validation_utils.get_tsv_errors. """ errors: Dict = {} # assay -> parent_sample_id # sample -> sample_id # organ -> organ_id - # contributors -> orcid_id + # contributors -> orcid (new) / orcid_id (old) constrained_fields = {} + schema_name = schema.schema_name if "sample" in schema_name: constrained_fields["sample_id"] = self.app_context.get("entities_url") @@ -549,7 +550,14 @@ def _url_checks( elif "murine-source" in schema_name: constrained_fields["source_id"] = self.app_context.get("entities_url") elif "contributors" in schema_name: - constrained_fields["orcid_id"] = "https://pub.orcid.org/v3.0/" + if schema.is_cedar: + constrained_fields["orcid"] = ( + "https://pub.orcid.org/v3.0/expanded-search/?q=orcid:" + ) + else: + constrained_fields["orcid_id"] = ( + "https://pub.orcid.org/v3.0/expanded-search/?q=orcid:" + ) else: constrained_fields["parent_sample_id"] = self.app_context.get("entities_url") @@ -593,13 +601,21 @@ def _check_single_url( ) -> Optional[Dict]: try: url = constrained_fields[field] + value - if field != "orcid_id": + if field not in ["orcid_id", "orcid"]: headers = self.app_context.get("request_header", {}) headers["Authorization"] = f"Bearer {self.globus_token}" + response = requests.get(url, headers=headers) + response.raise_for_status() else: - headers = {} - response = requests.get(url, headers=headers) - response.raise_for_status() + headers = {"Accept": "application/json"} + response = requests.get(url, headers=headers) + num_found = response.json().get("num-found") + if num_found == 1: + return + elif num_found == 0: + raise Exception(f"ORCID {value} does not exist.") + else: + raise Exception(f"Found {num_found} matches for ORCID {value}.") except Exception as e: error = { "errorType": type(e).__name__, diff --git a/tests-manual/update_test_data.py b/tests-manual/update_test_data.py index 37e942303..7f9c3e994 100644 --- a/tests-manual/update_test_data.py +++ b/tests-manual/update_test_data.py @@ -10,6 +10,7 @@ from ingest_validation_tools.cli_utils import dir_path from ingest_validation_tools.error_report import ErrorReport +from ingest_validation_tools.schema_loader import SchemaVersion from ingest_validation_tools.table_validator import ReportType from ingest_validation_tools.upload import Upload from ingest_validation_tools.validation_utils import TSVError, read_rows @@ -70,7 +71,12 @@ def update_test_data(self) -> Dict[str, List]: raise Exception( f"Something went wrong with Spreadsheet Validator request for {self.dir}." ) - elif "Unauthorized" in report.as_md(): + # Necessary to avoid including 'Unauthorized' in output when no Globus token is provided, + # but only relevant for entity-api links. + elif ( + "No token" in report.as_md() + or "Unauthorized for url: https://entity.api" in report.as_md() + ): raise Exception( f"URL checking returned 'Unauthorized' in response while checking {self.dir}; did you forget a Globus token?" ) @@ -165,7 +171,7 @@ def update_fixtures(self, report: ErrorReport) -> Dict: continue if schema.is_cedar: new_validation_data[schema.schema_name] = self.upload.online_checks( - path, schema.schema_name, ReportType.STR + path, schema, ReportType.STR ) for other_type, paths in { "antibodies": schema.antibodies_paths, @@ -185,7 +191,11 @@ def update_fixtures(self, report: ErrorReport) -> Dict: def other_type_online_checks(self, path: str, other_type: str) -> Dict: metadata_schema_id = read_rows(Path(path), "ascii")[0].get("metadata_schema_id") if metadata_schema_id: - return {Path(path).stem: self.upload.online_checks(path, other_type, ReportType.STR)} + return { + Path(path).stem: self.upload.online_checks( + path, SchemaVersion(schema_name=other_type, is_cedar=True), ReportType.STR + ) + } return {} def open_or_create_fixtures(self) -> Dict: diff --git a/tests/test_dataset_examples.py b/tests/test_dataset_examples.py index c38c75d39..052982a31 100644 --- a/tests/test_dataset_examples.py +++ b/tests/test_dataset_examples.py @@ -10,6 +10,7 @@ from unittest.mock import Mock, call, patch from ingest_validation_tools.error_report import ErrorReport +from ingest_validation_tools.schema_loader import SchemaVersion from ingest_validation_tools.upload import Upload from .fixtures import ( @@ -137,10 +138,10 @@ def _open_and_read_fixtures_file(path: str) -> Dict: return opened -def _online_side_effect(schema_name: str, dir_path: str, *args): +def _online_side_effect(schema: SchemaVersion, dir_path: str, *args): del args fixture = _open_and_read_fixtures_file(dir_path) - return fixture.get("validation", {}).get(schema_name, {}) + return fixture.get("validation", {}).get(schema.schema_name, {}) def _assaytype_side_effect(path: str, row: Dict, *args, **kwargs): @@ -206,8 +207,8 @@ def test_validate_dataset_examples(self, verbose: bool = False): ) as mock_assaytype_data: with patch( "ingest_validation_tools.upload.Upload.online_checks", - side_effect=lambda tsv_path, schema_name, report_type: _online_side_effect( - schema_name, test_dir, tsv_path, report_type + side_effect=lambda tsv_path, schema, report_type: _online_side_effect( + schema, test_dir, tsv_path, report_type ), ): try: @@ -266,8 +267,8 @@ def prep_upload(self, test_dir: str, opts: Dict, patch_data: Dict): ): with patch( "ingest_validation_tools.upload.Upload.online_checks", - side_effect=lambda tsv_path, schema_name, report_type: _online_side_effect( - schema_name, test_dir, tsv_path, report_type + side_effect=lambda tsv_path, schema, report_type: _online_side_effect( + schema, test_dir, tsv_path, report_type ), ): with patch(