Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORCID bugfix #1322

Merged
merged 2 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Remove bad paths from LC-MS directory schema
- Allow multiple comma-separated parent_sample_id values
- Accommodate dir schema minor versions
- Fix ORCID URL checking

## v0.0.18

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ Upload Errors:
column 'contributors_path', value './contributors.tsv'
: CEDAR Validation Errors:
examples/dataset-examples/bad-cedar-assay-histology/upload/contributors.tsv:
URL Errors:
- 'On row 2, column "orcid", value "0000-0002-8928-abcd" fails because of
error "Exception": ORCID 0000-0002-8928-abcd does not exist.'
Validation Errors:
- On row 0, column "orcid", value "0000-0002-8928-abcd" fails because of
error "invalidValueFormat".
Expand All @@ -26,4 +29,4 @@ Reference Errors:
Hint: 'If validation fails because of extra whitespace in the TSV, try:

src/cleanup_whitespace.py --tsv_in original.tsv --tsv_out clean.tsv.'
```
```
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"assaytype": {"Histology": {"assaytype": "h-and-e", "contains-pii": false, "dataset-type": "Histology", "description": "H&E Stained Microscopy", "dir-schema": "histology-v2", "primary": true, "vitessce-hints": []}}, "validation": {"h-and-e": {"URL Errors": ["On row 2, column \"parent_sample_id\", value \"wrong\" fails because of error \"HTTPError\": 400 Client Error: Bad Request for url: https://entity.api.hubmapconsortium.org/entities/wrong"], "Validation Errors": ["On row 0, column \"parent_sample_id\", value \"wrong\" fails because of error \"invalidValueFormat\"", "On row 1, column \"contributors_path\", value \"\" fails because of error \"missingRequired\""]}, "contributors": {"Validation Errors": ["On row 0, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"invalidValueFormat\""]}}}
{"assaytype": {"Histology": {"assaytype": "h-and-e", "contains-pii": false, "dataset-type": "Histology", "description": "H&E Stained Microscopy", "dir-schema": "histology-v2", "primary": true, "vitessce-hints": []}}, "validation": {"h-and-e": {"URL Errors": ["On row 2, column \"parent_sample_id\", value \"wrong\" fails because of error \"HTTPError\": 400 Client Error: Bad Request for url: https://entity.api.hubmapconsortium.org/entities/wrong"], "Validation Errors": ["On row 0, column \"parent_sample_id\", value \"wrong\" fails because of error \"invalidValueFormat\"", "On row 1, column \"contributors_path\", value \"\" fails because of error \"missingRequired\""]}, "contributors": {"URL Errors": ["On row 2, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"Exception\": ORCID 0000-0002-8928-abcd does not exist."], "Validation Errors": ["On row 0, column \"orcid\", value \"0000-0002-8928-abcd\" fails because of error \"invalidValueFormat\""]}}}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Juan Puerto Juan Puerto PSC 0000-0002-8928-741X Yes No No 94dae6f8-0756-4ab0-a47b-138e446a9501
Juan Puerto Juan Puerto PSC 0000-0002-1825-0097 Yes No No 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
first_name last_name middle_name_or_initial display_name affiliation orcid email is_contact is_principal_investigator is_operator metadata_schema_id
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-8928-741X [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
Firstname Lastname M. Firstname M. Lastname Carnegie Mellon University 0000-0002-1825-0097 [email protected] Yes Yes Yes 94dae6f8-0756-4ab0-a47b-138e446a9501
42 changes: 29 additions & 13 deletions src/ingest_validation_tools/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ def _validate(
logging.info(f"{tsv_path}: Offline validation selected, cannot reach API.")
return errors
else:
api_errors = self.online_checks(tsv_path, schema_version.schema_name, report_type)
api_errors = self.online_checks(tsv_path, schema_version, report_type)
if api_errors:
api_validated[f"{tsv_path}"] = api_errors
if local_validated:
Expand All @@ -342,10 +342,10 @@ def _validate(
def online_checks(
self,
tsv_path: str,
schema_name: str,
schema: SchemaVersion,
report_type: ReportType = ReportType.STR,
) -> Optional[Dict]:
url_errors = self._url_checks(tsv_path, schema_name, report_type)
url_errors = self._url_checks(tsv_path, schema, report_type)
api_errors = self._api_validation(Path(tsv_path), report_type)
if url_errors or api_errors:
return url_errors | api_errors
Expand Down Expand Up @@ -526,21 +526,22 @@ def _cedar_api_call(self, tsv_path: Union[str, Path]) -> requests.models.Respons
return response

def _url_checks(
self, tsv_path: str, schema_name: str, report_type: ReportType = ReportType.STR
self, tsv_path: str, schema: SchemaVersion, report_type: ReportType = ReportType.STR
):
"""
Check provided UUIDs/HuBMAP IDs for parent_id, sample_id, organ_id.
Not using get_table_errors because CEDAR schema fields do not match
the TSV fields, which makes frictionless confused and upset.
Check provided values for parent_sample_id and orcid_id; additionally
check sample_id, organ_id, and source_id values in single TSV validation
via validation_utils.get_tsv_errors.
"""
errors: Dict = {}

# assay -> parent_sample_id
# sample -> sample_id
# organ -> organ_id
# contributors -> orcid_id
# contributors -> orcid (new) / orcid_id (old)

constrained_fields = {}
schema_name = schema.schema_name

if "sample" in schema_name:
constrained_fields["sample_id"] = self.app_context.get("entities_url")
Expand All @@ -549,7 +550,14 @@ def _url_checks(
elif "murine-source" in schema_name:
constrained_fields["source_id"] = self.app_context.get("entities_url")
elif "contributors" in schema_name:
constrained_fields["orcid_id"] = "https://pub.orcid.org/v3.0/"
if schema.is_cedar:
constrained_fields["orcid"] = (
"https://pub.orcid.org/v3.0/expanded-search/?q=orcid:"
)
else:
constrained_fields["orcid_id"] = (
"https://pub.orcid.org/v3.0/expanded-search/?q=orcid:"
)
else:
constrained_fields["parent_sample_id"] = self.app_context.get("entities_url")

Expand Down Expand Up @@ -593,13 +601,21 @@ def _check_single_url(
) -> Optional[Dict]:
try:
url = constrained_fields[field] + value
if field != "orcid_id":
if field not in ["orcid_id", "orcid"]:
headers = self.app_context.get("request_header", {})
headers["Authorization"] = f"Bearer {self.globus_token}"
response = requests.get(url, headers=headers)
response.raise_for_status()
else:
headers = {}
response = requests.get(url, headers=headers)
response.raise_for_status()
headers = {"Accept": "application/json"}
response = requests.get(url, headers=headers)
num_found = response.json().get("num-found")
if num_found == 1:
return
elif num_found == 0:
raise Exception(f"ORCID {value} does not exist.")
else:
raise Exception(f"Found {num_found} matches for ORCID {value}.")
except Exception as e:
error = {
"errorType": type(e).__name__,
Expand Down
16 changes: 13 additions & 3 deletions tests-manual/update_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from ingest_validation_tools.cli_utils import dir_path
from ingest_validation_tools.error_report import ErrorReport
from ingest_validation_tools.schema_loader import SchemaVersion
from ingest_validation_tools.table_validator import ReportType
from ingest_validation_tools.upload import Upload
from ingest_validation_tools.validation_utils import TSVError, read_rows
Expand Down Expand Up @@ -70,7 +71,12 @@ def update_test_data(self) -> Dict[str, List]:
raise Exception(
f"Something went wrong with Spreadsheet Validator request for {self.dir}."
)
elif "Unauthorized" in report.as_md():
# Necessary to avoid including 'Unauthorized' in output when no Globus token is provided,
# but only relevant for entity-api links.
elif (
"No token" in report.as_md()
or "Unauthorized for url: https://entity.api" in report.as_md()
):
raise Exception(
f"URL checking returned 'Unauthorized' in response while checking {self.dir}; did you forget a Globus token?"
)
Expand Down Expand Up @@ -165,7 +171,7 @@ def update_fixtures(self, report: ErrorReport) -> Dict:
continue
if schema.is_cedar:
new_validation_data[schema.schema_name] = self.upload.online_checks(
path, schema.schema_name, ReportType.STR
path, schema, ReportType.STR
)
for other_type, paths in {
"antibodies": schema.antibodies_paths,
Expand All @@ -185,7 +191,11 @@ def update_fixtures(self, report: ErrorReport) -> Dict:
def other_type_online_checks(self, path: str, other_type: str) -> Dict:
metadata_schema_id = read_rows(Path(path), "ascii")[0].get("metadata_schema_id")
if metadata_schema_id:
return {Path(path).stem: self.upload.online_checks(path, other_type, ReportType.STR)}
return {
Path(path).stem: self.upload.online_checks(
path, SchemaVersion(schema_name=other_type, is_cedar=True), ReportType.STR
)
}
return {}

def open_or_create_fixtures(self) -> Dict:
Expand Down
13 changes: 7 additions & 6 deletions tests/test_dataset_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from unittest.mock import Mock, call, patch

from ingest_validation_tools.error_report import ErrorReport
from ingest_validation_tools.schema_loader import SchemaVersion
from ingest_validation_tools.upload import Upload

from .fixtures import (
Expand Down Expand Up @@ -137,10 +138,10 @@ def _open_and_read_fixtures_file(path: str) -> Dict:
return opened


def _online_side_effect(schema_name: str, dir_path: str, *args):
def _online_side_effect(schema: SchemaVersion, dir_path: str, *args):
del args
fixture = _open_and_read_fixtures_file(dir_path)
return fixture.get("validation", {}).get(schema_name, {})
return fixture.get("validation", {}).get(schema.schema_name, {})


def _assaytype_side_effect(path: str, row: Dict, *args, **kwargs):
Expand Down Expand Up @@ -206,8 +207,8 @@ def test_validate_dataset_examples(self, verbose: bool = False):
) as mock_assaytype_data:
with patch(
"ingest_validation_tools.upload.Upload.online_checks",
side_effect=lambda tsv_path, schema_name, report_type: _online_side_effect(
schema_name, test_dir, tsv_path, report_type
side_effect=lambda tsv_path, schema, report_type: _online_side_effect(
schema, test_dir, tsv_path, report_type
),
):
try:
Expand Down Expand Up @@ -266,8 +267,8 @@ def prep_upload(self, test_dir: str, opts: Dict, patch_data: Dict):
):
with patch(
"ingest_validation_tools.upload.Upload.online_checks",
side_effect=lambda tsv_path, schema_name, report_type: _online_side_effect(
schema_name, test_dir, tsv_path, report_type
side_effect=lambda tsv_path, schema, report_type: _online_side_effect(
schema, test_dir, tsv_path, report_type
),
):
with patch(
Expand Down
Loading