Skip to content

Commit

Permalink
[ENG-5136] datacite 4.5 (#10529)
Browse files Browse the repository at this point in the history
## Purpose
support datacite 4.5 (most especially the new resourceTypeGeneral options)

## Changes
- update static datacite xsd (used for xml metadata validation) to 4.5
- update all "4.4" DataCite IRIs (those starting with "https://schema.datacite.org/meta/kernel-4.4/")
  - use "/kernel-4/" for concepts (pins to DataCite's major version, so no further recatalog is needed for this reason until DataCite 5, if ever)
  - use "/kernel-4.5/" for specific XSD files (pins to DataCite's minor version, for reliable validation)
- add `--datacite-custom-types` arg to `recatalog_metadata` management command
- add "Instrument" and "StudyRegistration" resource type options
- update metadata tests accordingly

## Side Effects
- changing the IRI used to represent DataCite types in metadata will cause minor inconsistencies in the search index
  - the only UI-visible effect will be duplicate values in the "resource type" search filter facet on OSF search
  - resolve with `manage.py recatalog_metadata --datacite-custom-types` (may take a short while in large environments)
  - note: using `/kernel-4/` IRIs instead of `/kernel-4.4/` (or `/kernel-4.5/`) means we won't have this problem with future minor versions (4.6, etc.), only with major versions (which may work entirely differently anyway)

## Ticket
[ENG-5136]
  • Loading branch information
aaxelb authored Feb 13, 2024
1 parent 13f633d commit a653b35
Show file tree
Hide file tree
Showing 39 changed files with 428 additions and 324 deletions.
99 changes: 65 additions & 34 deletions osf/management/commands/recatalog_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,23 @@
logger = logging.getLogger(__name__)


def recatalog(provided_model, providers, start_id, chunk_count, chunk_size):
def recatalog(queryset, start_id, chunk_count, chunk_size):
_chunk_start_id = start_id
for _ in range(chunk_count):
_last_id = recatalog_chunk(provided_model, providers, _chunk_start_id, chunk_size)
_last_id = recatalog_chunk(queryset, _chunk_start_id, chunk_size)
if _last_id is None:
logger.info('All done!')
return
_chunk_start_id = _last_id + 1


def recatalog_chunk(provided_model, providers, start_id, chunk_size):
items = provided_model.objects.filter(
id__gte=start_id,
).order_by('id')

if providers is not None:
items = items.filter(provider__in=providers)

item_chunk = list(items[:chunk_size])
def recatalog_chunk(queryset, start_id, chunk_size):
item_chunk = list(
queryset
.filter(id__gte=start_id)
.order_by('id')
[:chunk_size]
)
last_id = None
if item_chunk:
first_id = item_chunk[0].id
Expand All @@ -46,29 +44,34 @@ def recatalog_chunk(provided_model, providers, start_id, chunk_size):
else:
logger.debug('skipping item without guid: %s', item)

logger.info(f'Queued metadata recataloguing for {len(item_chunk)} {provided_model.__name__}ses (ids in range [{first_id},{last_id}])')
logger.info(f'Queued metadata recataloguing for {len(item_chunk)} {queryset.model.__name__}ses (ids in range [{first_id},{last_id}])')
else:
logger.info(f'Done recataloguing metadata for {provided_model.__name__}ses!')
logger.info(f'Done recataloguing metadata for {queryset.model.__name__}ses!')

return last_id


class Command(BaseCommand):
def add_arguments(self, parser):
provider_group = parser.add_mutually_exclusive_group(required=True)
provider_group.add_argument(
'--providers',
type=str,
nargs='+',
help='recatalog metadata for items from specific providers (by `_id`)',
)
provider_group.add_argument(
'--all-providers',
'-a',
action='store_true',
help='recatalog metadata for items from all providers',
def _recatalog_all(queryset, chunk_size):
    """Recatalog every item in `queryset`, starting from the lowest id.

    Delegates to `recatalog` with a chunk count large enough to act as
    "no limit"; `recatalog` returns early once a chunk comes back empty.
    """
    _effectively_unlimited = int(9e9)
    recatalog(
        queryset,
        start_id=0,
        chunk_count=_effectively_unlimited,
        chunk_size=chunk_size,
    )


def _recatalog_datacite_custom_types(chunk_size):
    """Recatalog items whose metadata carries a datacite resource type.

    Covers all preprints, plus registrations, nodes, and osfstorage files
    that have a metadata record with a non-empty `resource_type_general`.

    Args:
        chunk_size: maximum number of items handled per chunk.
    """
    logger.info('recataloguing items with datacite custom type...')
    # all preprints
    _recatalog_all(Preprint.objects, chunk_size)
    # objects with custom resource_type_general
    # (a tuple rather than a set, so models are always processed -- and
    # logged -- in the same order from one run to the next)
    for _model in (Registration, Node, OsfStorageFile):
        _queryset = (
            _model.objects
            .exclude(guids__metadata_record__isnull=True)
            .exclude(guids__metadata_record__resource_type_general='')
        )
        _recatalog_all(_queryset, chunk_size)
    logger.info('done recataloguing items with datacite custom type!')


class Command(BaseCommand):
def add_arguments(self, parser):
type_group = parser.add_mutually_exclusive_group(required=True)
type_group.add_argument(
'--all-types',
Expand Down Expand Up @@ -100,6 +103,28 @@ def add_arguments(self, parser):
action='store_true',
help='recatalog metadata for users',
)
type_group.add_argument(
'--datacite-custom-types',
action='store_true',
help='''recatalog metadata for items with a specific datacite type,
including all preprints and items with custom resource_type_general
(may be slow for lack of database indexes)
''',
)

provider_group = parser.add_mutually_exclusive_group()
provider_group.add_argument(
'--providers',
type=str,
nargs='+',
help='recatalog metadata for items from specific providers (by `_id`)',
)
provider_group.add_argument(
'--all-providers',
'-a',
action='store_true',
help='recatalog metadata for items from all providers (default if no --providers given)',
)

parser.add_argument(
'--start-id',
Expand All @@ -121,22 +146,23 @@ def add_arguments(self, parser):
)

def handle(self, *args, **options):
pls_all_providers = options['all_providers']
pls_all_types = options['all_types']
pls_recatalog_preprints = options['preprints']
pls_recatalog_registrations = options['registrations']
pls_recatalog_projects = options['projects']
pls_recatalog_files = options['files']
pls_recatalog_users = options['users']
provider_ids = options.get('providers')
start_id = options['start_id']
chunk_size = options['chunk_size']
chunk_count = options['chunk_count']
datacite_custom_types = options['datacite_custom_types']

if pls_all_providers:
providers = None # `None` means "don't filter by provider"
else:
provider_ids = options['providers']
providers = AbstractProvider.objects.filter(_id__in=provider_ids)
if datacite_custom_types: # temporary arg for datacite 4.5 migration
assert not start_id, 'oh no, cannot resume with `--datacite-custom-types`'
assert not provider_ids, 'oh no, cannot filter providers with `--datacite-custom-types`'
_recatalog_datacite_custom_types(chunk_size)
return # end

if pls_all_types:
assert not start_id, 'choose a specific type to resume with --start-id'
Expand All @@ -154,4 +180,9 @@ def handle(self, *args, **options):
provided_models = [OSFUser]

for provided_model in provided_models:
recatalog(provided_model, providers, start_id, chunk_count, chunk_size)
_queryset = provided_model.objects
if provider_ids is not None:
_queryset = _queryset.filter(
provider__in=AbstractProvider.objects.filter(_id__in=provider_ids),
)
recatalog(_queryset, start_id, chunk_count, chunk_size)
Loading

0 comments on commit a653b35

Please sign in to comment.