From 0b6e5566cd54b3841d4507beb1243811c9d6c77b Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 27 Nov 2024 10:40:04 -0800 Subject: [PATCH] [r] Support for AnVIL duos_id (#6620) --- src/azul/plugins/metadata/anvil/__init__.py | 5 +++++ .../plugins/metadata/anvil/indexer/transform.py | 1 + .../plugins/metadata/anvil/service/response.py | 1 + src/azul/plugins/repository/tdr_anvil/__init__.py | 5 +++-- src/azul/terra.py | 14 ++++++++++---- ...f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json | 1 + test/indexer/test_anvil.py | 8 ++++++-- .../manifest/verbatim/pfb/anvil/pfb_entities.json | 2 ++ .../manifest/verbatim/pfb/anvil/pfb_schema.json | 8 ++++++++ test/service/test_manifest.py | 6 ++++++ test/service/test_response_anvil.py | 5 +++-- 11 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index ab4c55441c..35be1410e3 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -156,6 +156,7 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'registered_identifier', 'title', 'data_modality', + 'duos_id', ] }, 'donors': { @@ -351,6 +352,10 @@ def verbatim_pfb_schema(self, is_polymorphic=is_duos_type) ] if is_duos_type: + field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, + column_name='duos_id', + anvil_datatype='string', + is_polymorphic=True)) field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, column_name='description', anvil_datatype='string', diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index c5bc430299..df8b958935 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -498,6 +498,7 @@ def _duos_types(cls) -> FieldTypes: return { 'document_id': null_str, 'description': null_str, + 'duos_id': null_str, } def _duos(self, dataset: EntityReference) -> MutableJSON: diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 8d0be3c129..6175bd6473 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -210,6 +210,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]: }, 'datasets': { 'dataset_id', + 'duos_id', 'title' }, 'diagnoses': { diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index b376636400..5db08d26c2 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -479,7 +479,7 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: assert not bundle_fqid.is_batched, bundle_fqid - duos_info = self.tdr.get_duos(bundle_fqid.source) + duos_id, duos_info = self.tdr.get_duos(bundle_fqid.source) description = None if duos_info is None else duos_info.get('studyDescription') ref, row = self._get_dataset(bundle_fqid.source.spec) expected_entity_id = change_version(bundle_fqid.uuid, @@ -487,7 +487,8 @@ def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: self.datarepo_row_uuid_version) assert ref.entity_id == expected_entity_id, (ref, bundle_fqid) bundle = TDRAnvilBundle(fqid=bundle_fqid) - bundle.add_entity(ref, self._version, {'description': description}) + entity_row = {'duos_id': duos_id, 'description': description} + bundle.add_entity(ref, self._version, entity_row) # Classify as orphan to suppress the emission of a contribution bundle.add_entity(ref, self._version, dict(row), is_orphan=True) return bundle diff --git a/src/azul/terra.py b/src/azul/terra.py index e5046850a7..fdc61e745f 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -646,19 +646,25 @@ def for_registered_user(cls, authentication: OAuth2) -> 'TDRClient': def drs_client(self) -> DRSClient: return DRSClient(http_client=self._http_client) - def get_duos(self, source: TDRSourceRef) -> Optional[MutableJSON]: + def get_duos(self, + source: TDRSourceRef + ) -> tuple[str, MutableJSON] | tuple[None, None]: response = self._retrieve_source(source) try: duos_id = response['duosFirecloudGroup']['duosId'] except (KeyError, TypeError): log.warning('No DUOS ID available for %r', source.spec) - return None + return None, None else: url = self._duos_endpoint('dataset', 'registration', duos_id) response = self._request('GET', url) if response.status == 404: log.warning('No DUOS dataset registration with ID %r from %r', duos_id, source.spec) - return None + return None, None else: - return self._check_response(url, response) + response = self._check_response(url, response) + consent_group = one(response['consentGroups']) + require(duos_id == consent_group['datasetIdentifier'], + 'Mismatched identifiers', duos_id, consent_group) + return duos_id, response diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json index 9859b200f1..b029436606 100644 --- a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json +++ b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json @@ -2,6 +2,7 @@ "entities": { "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "version": "2022-06-01T00:00:00.000000Z" } }, diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 1cbc52ba0e..e6f0114dd0 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -75,7 +75,7 @@ def setUpClass(cls) -> None: mock_duos_url = furl('https:://mock_duos.lan') - duos_id = 'foo' + duos_id = 'DUOS-000000' duos_description = 'Study description from DUOS' @classmethod @@ -93,6 +93,9 @@ def _patch_duos(cls) -> None: } })), Mock(spec=HTTPResponse, status=200, data=json.dumps({ + 'consentGroups': [{ + 'datasetIdentifier': cls.duos_id + }], 'studyDescription': cls.duos_description })) ])) @@ -251,8 +254,9 @@ def test_dataset_description(self): # These fields are populated only in the primary bundle self.assertEqual(dataset_ref.entity_id, contents['document_id']) self.assertEqual(['phs000693'], contents['registered_identifier']) - # This field is populated only in the DUOS bundle + # These fields are populated only in the DUOS bundle self.assertEqual('Study description from DUOS', contents['description']) + self.assertEqual('DUOS-000000', contents['duos_id']) else: self.fail(qualifier) self.assertDictEqual(doc_counts, { diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json index 8cb9a00eda..145153fcb3 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json @@ -110,6 +110,7 @@ "datarepo_row_id": null, "dataset_id": null, "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "owner": null, "principal_investigator": null, "registered_identifier": null, @@ -282,6 +283,7 @@ "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", "description": null, + "duos_id": null, "owner": [ "Debbie Nickerson" ], diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json index 9bdd6fcf66..1f0d38f6f7 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json @@ -560,6 +560,14 @@ "string" ] }, + { + "name": "duos_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, { "name": "owner", "namespace": "anvil_dataset", diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 8fe99a5893..d5586b2b72 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1845,6 +1845,12 @@ def test_compact_manifest(self): '', '' ), + ( + 'datasets.duos_id', + '', + '', + '', + ), ( 'donors.document_id', '', diff --git a/test/service/test_response_anvil.py b/test/service/test_response_anvil.py index 43aba2e3c8..003bea7bc2 100644 --- a/test/service/test_response_anvil.py +++ b/test/service/test_response_anvil.py @@ -892,7 +892,7 @@ def test_entity_indices(self): } ], 'datasets': [ - {} + {'duos_id': ['DUOS-000000']} ], 'diagnoses': [], 'donors': [], @@ -1220,7 +1220,8 @@ def test_entity_indices(self): None ], 'accessible': True, - 'description': 'Study description from DUOS' + 'description': 'Study description from DUOS', + 'duos_id': 'DUOS-000000' } ], 'diagnoses': [