Skip to content

Commit

Permalink
github: added invenio github integration
Browse files Browse the repository at this point in the history
  • Loading branch information
alejandromumo committed Jun 27, 2023
1 parent 6686b98 commit 95a03cf
Show file tree
Hide file tree
Showing 5 changed files with 347 additions and 0 deletions.
4 changes: 4 additions & 0 deletions invenio_rdm_records/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from .services.permissions import RDMRecordPermissionPolicy
from .services.pids import providers

from .github.release import RDMGithubRelease

# Invenio-RDM-Records
# ===================

Expand Down Expand Up @@ -479,3 +481,5 @@ def make_doi(prefix, record):
...
}]
"""

GITHUB_RELEASE_CLASS = RDMGithubRelease
7 changes: 7 additions & 0 deletions invenio_rdm_records/github/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""RDM records implementation of Github."""
290 changes: 290 additions & 0 deletions invenio_rdm_records/github/release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Github release API implementation."""

import requests
import yaml
from flask import current_app
from invenio_github.api import GitHubRelease
from invenio_github.models import ReleaseStatus
from marshmallow import ValidationError
from mistune import markdown

from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from invenio_rdm_records.github.utils import get_extra_metadata, load_citation_metadata
from invenio_rdm_records.proxies import current_rdm_records_service
from invenio_records_resources.services.uow import UnitOfWork


class RDMGithubRelease(GitHubRelease):
    """Implement release API instance for RDM."""

    def _related_identifiers(self):
        """Return the related identifier pointing at the release's source tree.

        The identifier is the GitHub ``tree`` URL built from the repository's
        full name and the release tag name.
        """
        repo_name = self.repository["full_name"]
        release_tag_name = self.release["tag_name"]
        return {
            "identifier": f"https://github.com/{repo_name}/tree/{release_tag_name}",
            "scheme": "url",
            "relation_type": {"id": "issupplementto"},
            "resource_type": {"id": "software"},
        }

    def _title(self):
        """Generate a title from a release and its repository name.

        The release name is preferred; when it is missing or empty, the tag
        name from the payload is used, falling back to the tag stored on the
        release object.
        """
        repo_name = self.repository_object.name
        release_name = self.release.get("name") or self.release.get(
            "tag_name", self.release_object.tag
        )
        return f"{repo_name}: {release_name}"

    def _description(self):
        """Extract description from a release.

        If the release does not have any body, the repository description is
        used. Falls back to "No description provided.".
        """
        if self.release.get("body"):
            # The release body is markdown; it is rendered before being stored.
            return markdown(self.release["body"])
        elif self.repository.get("description"):
            return self.repository["description"]
        return "No description provided."

    def _default_metadata(self):
        """Return default metadata for a release.

        :returns: a dict with the description, rights, publication date,
            related identifiers, version, title, resource type and creators.
        """
        # The default right is currently hard-coded to cc-by-4.0.
        # TODO use the default software license
        default_right = "cc-by-4.0"
        version = self.release.get("tag_name", "")

        return dict(
            description=self._description(),
            rights={"id": default_right},
            # Keep only the first ten characters of the publication timestamp.
            publication_date=self.release["published_at"][:10],
            related_identifiers=[self._related_identifiers()],
            version=version,
            title=self._title(),
            resource_type={"id": "software"},
            creators=[
                {
                    "person_or_org": {
                        "type": "personal",
                        "given_name": "TODO GIVEN NAME",  # TODO get self.user.given_name (self.repository_object.user)
                        "family_name": "TODO FAMILY NAME",  # TODO get self.user.family_name (self.repository_object.user)
                    }
                }
            ],
        )

    @property
    def metadata(self):
        """Extracts metadata to create an RDM draft.

        Default metadata is overridden by whatever the citation file provides.
        """
        output = dict(self._default_metadata())
        # NOTE: the result of ``_extra_metadata()`` is not merged in for now.
        output.update(self._citation_metadata())
        return output

    # TODO what to do with this one?
    # TODO this is very instance specific, e.g. .zenodo.json
    # TODO it can be added later
    def _extra_metadata(self):
        """Get extra metadata for file in repository."""
        return get_extra_metadata(
            self.gh.api,
            self.repository["owner"]["login"],
            self.repository["name"],
            self.release["tag_name"],
        )

    def _citation_metadata(self):
        """Get citation metadata for file in repository.

        :returns: the loaded citation metadata, or an empty dict when no
            citation file is configured or when the file fails validation.
            Validation messages are recorded on the release object's errors,
            keyed by the citation file path.
        """
        citation_file_path = current_app.config.get("GITHUB_CITATION_FILE")

        if not citation_file_path:
            return {}

        try:
            # Read raw data from file
            data = self.load_citation_file()

            # Load metadata
            return load_citation_metadata(data)
        except ValidationError as e:
            self.release_object.errors.update({citation_file_path: e.messages})
            # Always hand back a mapping: ``metadata`` feeds this result to
            # ``dict.update``, which raises TypeError on ``None``.
            return {}

    def _test_zipball(self):
        """Check that the release zipball can be retrieved from GitHub.

        :raises AssertionError: if the final response status is not 200.
        """
        zipball_url = self.release["zipball_url"]

        # Execute a HEAD request to the zipball url to test the url.
        response = self.gh.api.session.head(zipball_url, allow_redirects=True)

        # In case where there is a tag and branch with the same name, we might
        # get back a "300 Multiple Choices" response, which requires fetching
        # an "alternate" link.
        if response.status_code == 300:
            zipball_url = response.links.get("alternate", {}).get("url")
            if zipball_url:
                response = self.gh.api.session.head(zipball_url, allow_redirects=True)
                # Another edge-case, is when the access token we have does not
                # have the scopes/permissions to access public links. In that
                # rare case we fallback to a non-authenticated request.
                if response.status_code == 404:
                    response = requests.head(zipball_url, allow_redirects=True)
                    # If this response is successful we want to use the finally
                    # resolved URL to fetch the ZIP from.
                    if response.status_code == 200:
                        zipball_url = response.url

        assert (
            response.status_code == 200
        ), f"Could not retrieve archive from GitHub: {zipball_url}"

    def load_citation_file(self):
        """Returns the citation file data.

        :returns: an empty dict when no citation file is configured, ``None``
            when the remote file could not be retrieved, otherwise the parsed
            YAML content of the file.
        """
        citation_file_name = current_app.config.get("GITHUB_CITATION_FILE")
        if not citation_file_name:
            return {}

        # Fetch the citation file and load it
        content = self.retrieve_remote_file(citation_file_name)

        data = (
            yaml.safe_load(content.decoded.decode("utf-8"))
            if content is not None
            else None
        )

        return data

    def resolve_record(self):
        """Resolves an RDM record from a release."""
        # Look the recid up through the helper defined on this class.
        recid = self._retrieve_recid_by_uuid(self.release_object.record_id)
        return current_rdm_records_service.read(system_identity, recid.pid_value)

    def _is_first_release(self):
        """Checks whether the current release is the first release of the repository."""
        return self.repository_object.releases.count() == 0

    def _retrieve_recid_by_uuid(self, rec_uuid):
        """Retrieves a persistent identifier given its object's uuid.

        Helper function.
        """
        recid = PersistentIdentifier.get_by_object(
            pid_type="recid",
            object_uuid=rec_uuid,
            object_type="rec",
        )
        return recid

    def publish(self):
        """Publish GitHub release as record.

        Drafts and records are created using the current records service.
        The following steps are run inside a single transaction:

        - Create a draft.
        - The draft's ownership is set to the user's id via its parent.
        - Upload files to the draft.
        - Publish the draft.

        In case of failure, the transaction is rolled back and the release status set to 'FAILED'

        :raises ex: any exception generated by the records service (e.g. invalid metadata)
        """
        try:
            self.release_object.status = ReleaseStatus.PROCESSING
            data = {
                "metadata": self.metadata,
                "access": {"record": "public", "files": "public"},
                "files": {"enabled": True},
            }

            with UnitOfWork(db.session) as uow:
                if self._is_first_release():
                    draft = current_rdm_records_service.create(
                        self.user_identity, data, uow=uow
                    )
                else:
                    # Retrieve latest record id and its recid
                    latest_record_uuid = self.repository_object.latest_release(
                        ReleaseStatus.PUBLISHED
                    ).record_id

                    recid = self._retrieve_recid_by_uuid(latest_record_uuid)

                    # Create a new version and update its contents
                    new_version_draft = current_rdm_records_service.new_version(
                        self.user_identity, recid.pid_value, uow=uow
                    )
                    draft = current_rdm_records_service.update_draft(
                        self.user_identity, new_version_draft.id, data, uow=uow
                    )

                # Validate the release files are fetchable
                self._test_zipball()

                # Upload files to draft
                draft_file_service = current_rdm_records_service.draft_files

                draft_file_service.init_files(
                    self.user_identity,
                    draft.id,
                    data=[{"key": self.release_file_name}],
                    uow=uow,
                )

                with self.fetch_zipball_file() as file_stream:
                    draft_file_service.set_file_content(
                        self.user_identity,
                        draft.id,
                        self.release_file_name,
                        file_stream,
                        uow=uow,
                    )

                draft_file_service.commit_file(
                    self.user_identity, draft.id, self.release_file_name, uow=uow
                )

                record = current_rdm_records_service.publish(
                    self.user_identity, draft.id, uow=uow
                )

                # Update release weak reference and set status to PUBLISHED
                self.release_object.record_id = record._record.model.id
                self.release_object.status = ReleaseStatus.PUBLISHED

                # UOW must be committed manually since we're not using the decorator
                uow.commit()
                return record
        except Exception:
            # Flag release as FAILED and re-raise the original exception
            self.release_object.status = ReleaseStatus.FAILED
            raise

    def process_release(self):
        """Processes a github release.

        The release might be first validated, in terms of sender, and then published.

        :raises ex: any exception generated by the records service when creating a draft or publishing the release record.
        """
        try:
            record = self.publish()
            return record
        except Exception as ex:
            current_app.logger.exception(
                f"Error while processing GitHub release {self.release_object.id}: {str(ex)}"
            )
            raise
45 changes: 45 additions & 0 deletions invenio_rdm_records/github/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Utility functions."""

import json

import yaml
from flask import current_app
from invenio_github.errors import CustomGitHubMetadataError
from marshmallow import Schema


def get_extra_metadata(gh, owner, repo_name, ref):
    """Load the JSON metadata file of a repository at a given ref.

    The file path is read from the ``GITHUB_METADATA_FILE`` application
    config entry.

    :param gh: client object exposing ``repository(owner, repo_name)``.
    :param owner: owner of the repository.
    :param repo_name: name of the repository.
    :param ref: ref at which the file is looked up.
    :returns: the decoded JSON content, or an empty dict when no content is
        returned for the file.
    :raises CustomGitHubMetadataError: when a ``ValueError`` is raised while
        fetching or decoding the file.
    """
    # TODO probably deprecated. On Zenodo we kept it for backwards compatibility
    try:
        repository = gh.repository(owner, repo_name)
        metadata_file = repository.file_contents(
            path=current_app.config["GITHUB_METADATA_FILE"], ref=ref
        )
        if metadata_file:
            raw_json = metadata_file.decoded.decode("utf-8")
            return json.loads(raw_json)
        # File does not exist in the given ref
        return {}
    except ValueError:
        raise CustomGitHubMetadataError(file=current_app.config["GITHUB_METADATA_FILE"])


def load_citation_metadata(citation_data):
    """Load citation data through the configured citation schema.

    The schema is read from the ``GITHUB_CITATION_METADATA_SCHEMA``
    application config entry. It may be a marshmallow ``Schema`` subclass
    (instantiated here) or an already constructed ``Schema`` instance.

    :param citation_data: raw citation data; any falsy value short-circuits.
    :returns: an empty dict for falsy input, otherwise the result of the
        schema's ``load``.
    :raises AssertionError: if no valid citation schema is configured.
    """
    if not citation_data:
        return {}

    citation_schema = current_app.config.get("GITHUB_CITATION_METADATA_SCHEMA")

    if isinstance(citation_schema, type) and issubclass(citation_schema, Schema):
        # Configured as a class: build an instance to load with.
        schema = citation_schema()
    elif isinstance(citation_schema, Schema):
        # Configured as an instance: use it directly (instances are not callable).
        schema = citation_schema
    else:
        # AssertionError is kept so the failure type matches the former assert.
        raise AssertionError("Citation schema is needed to load citation metadata.")

    return schema.load(citation_data)
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ install_requires =
pytz>=2020.4
pyyaml>=5.4.0
python-slugify>=8.0.1
invenio-github>=1.0.0a8 # TODO actually bump it

[options.extras_require]
tests =
Expand Down

0 comments on commit 95a03cf

Please sign in to comment.