Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add invenio GitHub #1337

Merged
merged 3 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions invenio_rdm_records/services/github/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""RDM records implementation of Github."""
168 changes: 168 additions & 0 deletions invenio_rdm_records/services/github/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""RDM github release metadata."""
import yaml
from flask import current_app
from invenio_github.errors import CustomGitHubMetadataError
from invenio_i18n import _
from marshmallow import Schema, ValidationError
from mistune import markdown
from nameparser import HumanName


class RDMReleaseMetadata(object):
    """Wraps a release object to extract its data to meet RDM specific needs.

    The wrapped object is only accessed through its ``repository_payload`` and
    ``release_payload`` mappings, its ``contributors`` iterable and its
    ``retrieve_remote_file`` method.
    """

    def __init__(self, rdm_github_release):
        """Constructor.

        :param rdm_github_release: release API object the metadata is read from.
        """
        self.rdm_release = rdm_github_release

    @property
    def related_identifiers(self):
        """Return the related identifier pointing at the release tree on GitHub."""
        repo_name = self.rdm_release.repository_payload["full_name"]
        release_tag_name = self.rdm_release.release_payload["tag_name"]
        return {
            "identifier": f"https://github.com/{repo_name}/tree/{release_tag_name}",
            "scheme": "url",
            "relation_type": {"id": "issupplementto"},
            "resource_type": {"id": "software"},
        }

    @property
    def title(self):
        """Generate a title from a release and its repository name.

        The release name is used when set; otherwise the tag name is used.
        """
        repo_name = self.rdm_release.repository_payload["full_name"]
        release_payload = self.rdm_release.release_payload
        release_name = release_payload.get("name") or release_payload["tag_name"]
        return f"{repo_name}: {release_name}"

    @property
    def description(self):
        """Extract description from a release.

        The release body, when present, is passed through ``markdown()``.
        If the release does not have any body, the repository description is
        used as-is. Falls back to "No description provided.".
        """
        release_body = self.rdm_release.release_payload.get("body")
        if release_body:
            return markdown(release_body)

        repo_description = self.rdm_release.repository_payload.get("description")
        if repo_description:
            return repo_description

        return _("No description provided.")

    @property
    def default_metadata(self):
        """Return default metadata for a release.

        :returns: dict with description, rights, publication date, related
            identifiers, version, title, resource type and creators.
        """
        # The default right is currently hard-coded.
        # TODO use the default software license
        default_right = "cc-by-4.0"
        release_payload = self.rdm_release.release_payload
        version = release_payload.get("tag_name", "")

        return dict(
            description=self.description,
            rights={"id": default_right},
            # Keep only the first 10 characters of the timestamp
            # (presumably the YYYY-MM-DD part of an ISO 8601 value).
            publication_date=release_payload["published_at"][:10],
            related_identifiers=[self.related_identifiers],
            version=version,
            title=self.title,
            resource_type={"id": "software"},
            creators=self.contributors,
        )

    @property
    def contributors(self):
        """Serializes contributors retrieved from github.

        :returns: list of RDM creator dicts, one per release contributor.
        """

        def serialize_author(gh_data):
            """Serializes github contributor data into RDM author.

            Reads the ``login`` (required), ``name`` and ``company`` keys.
            """
            login = gh_data["login"]
            # ``name`` may be present but empty/None: fall back to the login
            # in that case too, not only when the key is missing.
            name = gh_data.get("name") or login
            company = gh_data.get("company")

            human_name = HumanName(name)

            rdm_contributor = {
                "person_or_org": {
                    "type": "personal",
                    "given_name": human_name.first,
                    "family_name": human_name.surnames,
                },
            }
            # Only emit an affiliation when there is an actual name for it;
            # an empty or null affiliation name carries no information.
            if company:
                rdm_contributor["affiliations"] = [{"name": company}]
            return rdm_contributor

        # Contributors come from the wrapped release object
        return [serialize_author(c) for c in self.rdm_release.contributors]

    @property
    def citation_metadata(self):
        """Get citation metadata for file in repository.

        :returns: serialized citation metadata, or ``{}`` when no citation
            file is configured (``GITHUB_CITATION_FILE``).
        :raises CustomGitHubMetadataError: when the citation data does not
            validate against the configured schema.
        """
        citation_file_path = current_app.config.get("GITHUB_CITATION_FILE")

        if not citation_file_path:
            return {}

        try:
            # Read raw data from file
            data = self.load_citation_file(citation_file_path)

            # Load metadata from citation file and serialize it
            metadata = self.load_citation_metadata(data)
            return self.serialize_citation_metadata(metadata)
        except ValidationError as e:
            # Wrap the error into CustomGitHubMetadataError() so it can be
            # handled upstream; keep the original error chained as the cause.
            raise CustomGitHubMetadataError(
                file=citation_file_path, message=e.messages
            ) from e

    def serialize_citation_metadata(self, data):
        """Serializes citation data to RDM.

        :param data: loaded citation metadata.
        :returns: ``{}`` for empty input, otherwise ``data`` unchanged.
        """
        if not data:
            return {}
        # TODO to be implemented
        return data

    def load_citation_file(self, citation_file_name):
        """Returns the citation file data.

        :param citation_file_name: path of the citation file in the repository.
        :returns: ``{}`` when no file name is given, ``None`` when the remote
            file could not be retrieved, otherwise the parsed YAML content.
        """
        if not citation_file_name:
            return {}

        # Fetch the citation file through the wrapped release object: this
        # class does not implement remote file retrieval itself.
        content = self.rdm_release.retrieve_remote_file(citation_file_name)

        if content is None:
            return None

        return yaml.safe_load(content.decoded.decode("utf-8"))

    def load_citation_metadata(self, citation_data):
        """Load citation data through the configured citation schema.

        The schema is read from ``GITHUB_CITATION_METADATA_SCHEMA`` and may be
        either a marshmallow ``Schema`` subclass or a ``Schema`` instance.

        :param citation_data: raw citation data, as loaded from the file.
        :returns: ``{}`` for empty input, otherwise the schema-loaded data.
        :raises AssertionError: when no valid schema is configured.
        :raises ValidationError: when the data does not validate.
        """
        if not citation_data:
            return {}

        citation_schema = current_app.config.get("GITHUB_CITATION_METADATA_SCHEMA")

        if isinstance(citation_schema, Schema):
            # Already an instance: use it directly, it is not callable.
            schema = citation_schema
        else:
            assert isinstance(citation_schema, type) and issubclass(
                citation_schema, Schema
            ), _("Citation schema is needed to load citation metadata.")
            schema = citation_schema()

        return schema.load(citation_data)
149 changes: 149 additions & 0 deletions invenio_rdm_records/services/github/release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Github release API implementation."""

from flask import current_app
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_github.api import GitHubRelease
from invenio_github.models import ReleaseStatus
from invenio_records_resources.services.uow import UnitOfWork

from invenio_rdm_records.proxies import current_rdm_records_service
from invenio_rdm_records.resources.serializers.ui import UIJSONSerializer
from invenio_rdm_records.services.github.metadata import RDMReleaseMetadata
from invenio_rdm_records.services.github.utils import retrieve_recid_by_uuid


class RDMGithubRelease(GitHubRelease):
    """Release API implementation that publishes releases as RDM records."""

    @property
    def metadata(self):
        """Build the metadata used to create an RDM draft.

        Starts from the default release metadata and overlays the citation
        metadata on top of it.
        """
        release_metadata = RDMReleaseMetadata(self)
        merged = release_metadata.default_metadata
        merged.update(release_metadata.citation_metadata)
        return merged

    def resolve_record(self):
        """Read the RDM record referenced by the release, if there is one.

        :returns: ``None`` when the release has no record id, otherwise the
            result of reading the record with the system identity.
        """
        if self.release_object.record_id:
            pid = retrieve_recid_by_uuid(self.release_object.record_id)
            return current_rdm_records_service.read(system_identity, pid.pid_value)
        return None

    def publish(self):
        """Publish GitHub release as record.

        Drafts and records are created through the current records service.
        Inside a single unit of work the method:

        - creates a draft (or a new version draft of the latest published
          release's record),
        - uploads the release file to the draft,
        - publishes the draft.

        On any failure the release is flagged as failed, that state is
        committed and the exception is raised again.

        :raises ex: any exception generated by the records service (e.g. invalid metadata)
        """
        try:
            self.release_processing()
            # Persist the state change right away, in case publishing gets stuck
            db.session.commit()

            with UnitOfWork(db.session) as uow:
                draft_data = {
                    "metadata": self.metadata,
                    "access": {"record": "public", "files": "public"},
                    "files": {"enabled": True},
                }

                if not self.is_first_release():
                    # Find the record of the latest published release
                    previous_release = self.repository_object.latest_release(
                        ReleaseStatus.PUBLISHED
                    )
                    previous_pid = retrieve_recid_by_uuid(previous_release.record_id)

                    # Open a new version of it and fill in the new contents
                    version_draft = current_rdm_records_service.new_version(
                        self.user_identity, previous_pid.pid_value, uow=uow
                    )
                    draft = current_rdm_records_service.update_draft(
                        self.user_identity, version_draft.id, draft_data, uow=uow
                    )
                else:
                    draft = current_rdm_records_service.create(
                        self.user_identity, draft_data, uow=uow
                    )

                # Make sure the release files can be fetched
                self.test_zipball()

                # Attach the release file to the draft
                files_service = current_rdm_records_service.draft_files

                files_service.init_files(
                    self.user_identity,
                    draft.id,
                    data=[{"key": self.release_file_name}],
                    uow=uow,
                )

                with self.fetch_zipball_file() as zipball_stream:
                    files_service.set_file_content(
                        self.user_identity,
                        draft.id,
                        self.release_file_name,
                        zipball_stream,
                        uow=uow,
                    )

                files_service.commit_file(
                    self.user_identity, draft.id, self.release_file_name, uow=uow
                )

                published = current_rdm_records_service.publish(
                    self.user_identity, draft.id, uow=uow
                )

                # Store the weak reference to the record and flag as published
                self.release_object.record_id = published._record.model.id
                self.release_published()

                # No decorator is used here, so the unit of work is committed by hand
                uow.commit()
                return published
        except Exception as exc:
            # Flag the release as failed before propagating the error
            self.release_failed()
            # Only the failed state is committed at this point
            db.session.commit()
            raise exc

    def process_release(self):
        """Publish the release, logging any error before propagating it.

        :returns: the value returned by :meth:`publish`.
        :raises ex: any exception raised while publishing the release.
        """
        try:
            return self.publish()
        except Exception as ex:
            current_app.logger.exception(
                f"Error while processing GitHub release {self.release_object.id}: {str(ex)}"
            )
            raise ex

    def serialize_record(self):
        """Serialize the release's record data with the UI JSON serializer."""
        serializer = UIJSONSerializer()
        return serializer.serialize_object(self.record.data)
24 changes: 24 additions & 0 deletions invenio_rdm_records/services/github/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Utility functions."""

from flask import current_app
from invenio_pidstore.models import PersistentIdentifier
from marshmallow import Schema


def retrieve_recid_by_uuid(rec_uuid):
    """Look up the ``recid`` persistent identifier of a record object.

    Helper function.

    :param rec_uuid: UUID of the record object the identifier is attached to.
    :returns: the result of ``PersistentIdentifier.get_by_object`` for a
        ``recid`` PID of object type ``rec``.
    """
    lookup = {
        "pid_type": "recid",
        "object_uuid": rec_uuid,
        "object_type": "rec",
    }
    return PersistentIdentifier.get_by_object(**lookup)
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ install_requires =
invenio-administration>=1.2.0,<2.0.0
invenio-communities>=7.0.0,<8.0.0
invenio-drafts-resources>=1.4.1,<2.0.0
invenio-github>=1.0.0a8 # TODO actually bump it
invenio-i18n>=2.0.0,<3.0.0
invenio-oaiserver>=2.0.0,<3.0.0
invenio-vocabularies>=1.2.0,<2.0.0
nameparser>=1.1.1
pytz>=2020.4
pyyaml>=5.4.0
python-slugify>=8.0.1
Expand Down