Skip to content

Commit

Permalink
refactor: prefix doi management commands
Browse files Browse the repository at this point in the history
- prefix all one-off destructive DOI commands with `doi_`
- add reset_staging to mint new DOIs on staging using the datacite
  sandbox, doi_reset_staging -> step 3, doi_mint_parent_codebase_dois
- bump deps for datacite schema 4.5 and django cve
  • Loading branch information
alee committed Nov 19, 2024
1 parent 9c2678f commit e45893c
Show file tree
Hide file tree
Showing 16 changed files with 572 additions and 408 deletions.
110 changes: 74 additions & 36 deletions django/library/doi.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import logging
import re
import time
import threading
import queue
import re
import requests
import threading
import time

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
Expand All @@ -18,7 +18,7 @@
DataCiteRegistrationLog,
)

from datacite import DataCiteRESTClient, schema43
from datacite import DataCiteRESTClient, schema45
from datacite.errors import (
DataCiteError,
DataCiteNoContentError,
Expand All @@ -37,7 +37,7 @@
IS_STAGING = settings.DEPLOY_ENVIRONMENT.is_staging
IS_PRODUCTION = settings.DEPLOY_ENVIRONMENT.is_production

# prefix is different for (dev & staging) and production environments
# prefix differs across (dev + staging) and production
DATACITE_PREFIX = settings.DATACITE_PREFIX

MAX_DATACITE_API_WORKERS = 25
Expand Down Expand Up @@ -183,8 +183,12 @@ def _datacite_heartbeat_url(self):

def _validate_metadata(self, datacite_metadata: DataCiteSchema):
    """
    Validate metadata with the schema45 validator before it is sent to DataCite.

    Returns a (datacite_metadata, metadata_dict) tuple, where metadata_dict is
    datacite_metadata.to_dict().

    Raises DataCiteError if the validator rejects the metadata.
    """
    metadata_dict = datacite_metadata.to_dict()
    try:
        schema45.validator.validate(metadata_dict)
    except Exception as e:
        # one placeholder per argument: a surplus positional argument makes the
        # logging module report a formatting error instead of emitting the message
        logger.error(
            "Invalid DataCite metadata: %s (%s)",
            schema45.tostring(metadata_dict),
            e,
        )
        # chain the validator's exception so the underlying cause is preserved
        raise DataCiteError(f"Invalid DataCite metadata: {metadata_dict}") from e
    return datacite_metadata, metadata_dict

Expand All @@ -202,17 +206,22 @@ def mint_public_doi(self, codebase_or_release: Codebase | CodebaseRelease):
return "XX.DRYXX/XXXX-XRUN", True
if hasattr(codebase_or_release, "datacite"):
del codebase_or_release.datacite
datacite_metadata, metadata_dict = self._validate_metadata(
codebase_or_release.datacite
)

doi = "Unassigned"
http_status = 200
message = "Minted new DOI successfully."

datacite_metadata = codebase_or_release.datacite

try:
datacite_metadata, metadata_dict = self._validate_metadata(
datacite_metadata
)
doi = self.datacite_client.public_doi(
metadata_dict, url=codebase_or_release.permanent_url
)
codebase_or_release.doi = doi
codebase_or_release.save()
except DataCiteError as e:
logger.error(e)
message = str(e)
Expand All @@ -235,7 +244,27 @@ def mint_public_doi(self, codebase_or_release: Codebase | CodebaseRelease):
self._save_log_record(**log_record_dict)
return doi, http_status == 200

@classmethod
def is_metadata_fresh(cls, codebase_or_release: Codebase | CodebaseRelease):
    """
    Report whether the most recent registration log entry for the given
    codebase or release was recorded with the same metadata hash as the
    metadata the item generates now.

    Returns False when no registration log entry exists for the item.
    """
    try:
        latest_log = DataCiteRegistrationLog.objects.latest_entry(
            codebase_or_release
        )
        # discard any datacite metadata already attached to the instance so the
        # hash below is computed from a freshly generated value
        # (presumably `datacite` is a cached property -- confirm on the model)
        if hasattr(codebase_or_release, "datacite"):
            del codebase_or_release.datacite
        logged_hash = latest_log.metadata_hash
        current_hash = codebase_or_release.datacite.hash()
        return logged_hash == current_hash
    except DataCiteRegistrationLog.DoesNotExist:
        # nothing has been logged for this item, so treat its metadata as stale
        logger.info("No registration logs available for %s", codebase_or_release)
        return False

def update_doi_metadata(self, codebase_or_release: Codebase | CodebaseRelease):
if self.is_metadata_fresh(codebase_or_release):
logger.info("No need to update DOI metadata for %s", codebase_or_release)
return True
doi = codebase_or_release.doi
if self.dry_run:
logger.debug("DRY RUN")
Expand Down Expand Up @@ -278,16 +307,10 @@ def update_doi_metadata(self, codebase_or_release: Codebase | CodebaseRelease):
self._save_log_record(**log_record_dict)
return http_status == 200

def mint_new_doi_for_codebase(self, codebase: Codebase) -> str:
return self.mint_public_doi(codebase)

def mint_new_doi_for_release(self, release: CodebaseRelease) -> str:
return self.mint_public_doi(release)

def update_metadata_for_codebase(self, codebase: Codebase) -> bool:
def update_codebase_metadata(self, codebase: Codebase) -> bool:
return self.update_doi_metadata(codebase)

def update_metadata_for_release(self, release: CodebaseRelease) -> bool:
def update_release_metadata(self, release: CodebaseRelease) -> bool:
return self.update_doi_metadata(release)

@staticmethod
Expand Down Expand Up @@ -398,29 +421,44 @@ def _is_same_metadata(sent_data, received_data):
logger.debug("Missing attributes:", missing_attributes)
return False

def check_metadata(self, codebase_or_release: Codebase | CodebaseRelease) -> bool:
    """
    Check whether the metadata DataCite returns for codebase_or_release.doi
    matches the metadata generated locally.

    1. get metadata for codebase_or_release.doi from the DataCite client
    2. compare it with codebase_or_release.datacite.to_dict()

    Returns True on a dry run (no request is made), otherwise the result of
    DataCiteApi._is_same_metadata. Returns False when the item has no DOI or
    when the lookup / comparison raises.
    """
    if self.dry_run:
        logger.debug(
            "Dry run metadata check for %s", codebase_or_release.datacite.to_dict()
        )
        return True
    if not codebase_or_release.doi:
        logger.warning(
            "Unnecessary metadata check for non-DOI codebase or release %s",
            codebase_or_release,
        )
        return False
    try:
        comses_metadata = codebase_or_release.datacite.to_dict()
        datacite_metadata = self.datacite_client.get_metadata(
            codebase_or_release.doi
        )
        logger.debug(
            "comparing datacite metadata\n\n%s\n\nwith comses metadata\n\n%s",
            datacite_metadata,
            comses_metadata,
        )
        return DataCiteApi._is_same_metadata(comses_metadata, datacite_metadata)
    except Exception:
        # best-effort check: any failure counts as "not in sync", but keep the
        # traceback and the offending item in the log so the failure is diagnosable
        logger.exception("Metadata check failed for %s", codebase_or_release)
        return False

def validate_metadata(self, items):
    """
    Lazily yield (item, check_metadata(item)) pairs for every item in `items`
    that has a truthy `doi`; items without a DOI are skipped.
    """
    yield from (
        (candidate, self.check_metadata(candidate))
        for candidate in items
        if candidate.doi
    )

def threaded_metadata_check(self, items):
def loading_animation(thread):
while thread.is_alive():
Expand All @@ -431,7 +469,7 @@ def loading_animation(thread):
def _check_metadata(q: queue.Queue):
with ThreadPoolExecutor(max_workers=MAX_DATACITE_API_WORKERS) as executor:
results = executor.map(
lambda item: (item.pk, self.check_metadata(item)), items
lambda item: (item, self.check_metadata(item)), items
)

q.put(results)
Expand Down Expand Up @@ -524,7 +562,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
"""
if not codebase_doi:
# request to DataCite API
codebase_doi = datacite_api.mint_new_doi_for_codebase(codebase)
codebase_doi = datacite_api.mint_public_doi(codebase)

if not codebase_doi:
logger.error(
Expand All @@ -544,7 +582,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
Mint DOI for release
"""
# request to DataCite API
release_doi = datacite_api.mint_new_doi_for_release(release)
release_doi = datacite_api.mint_public_doi(release)
if not release_doi:
logger.error("Could not mint DOI for release %s. Skipping.", release.pk)
if interactive:
Expand All @@ -559,7 +597,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
"""
Since a new DOI has been minted for the release, we need to update it's parent's metadata (HasVersion)
"""
ok = datacite_api.update_metadata_for_codebase(codebase)
ok = datacite_api.update_codebase_metadata(codebase)
if not ok:
logger.error("Failed to update metadata for codebase %s", codebase.pk)

Expand All @@ -572,15 +610,15 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
next_release = release.get_next_release()

if previous_release and previous_release.doi:
ok = datacite_api.update_metadata_for_release(previous_release)
ok = datacite_api.update_release_metadata(previous_release)
if not ok:
logger.error(
"Failed to update metadata for previous_release %s",
previous_release.pk,
)

if next_release and next_release.doi:
ok = datacite_api.update_metadata_for_release(next_release)
ok = datacite_api.update_release_metadata(next_release)
if not ok:
logger.error(
"Failed to update metadata for next_release %s", next_release.pk
Expand Down Expand Up @@ -619,7 +657,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
if invalid_codebases:
logger.error(
"FAILURE: %s Codebases with invalid or missing DOIs: %s",
invalid_codebases.count(),
len(invalid_codebases),
invalid_codebases,
)
else:
Expand Down
64 changes: 0 additions & 64 deletions django/library/management/commands/clean_peer_reviewed_dois_02.py

This file was deleted.

This file was deleted.

Loading

0 comments on commit e45893c

Please sign in to comment.