Skip to content

Commit

Permalink
documents: find and replace identifiedBy
Browse files Browse the repository at this point in the history
* Searches and replaces contribution identifiedBy values with $refs.

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep and rerowep committed Nov 18, 2021
1 parent f3d1a69 commit e8d753b
Show file tree
Hide file tree
Showing 15 changed files with 240 additions and 22 deletions.
10 changes: 9 additions & 1 deletion rero_ils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,12 @@ def _(x):
'schedule': crontab(minute=0, hour=1), # Every day at 01:00 UTC,
'enabled': False
},
'find-contribution': {
'task': ('rero_ils.modules.documents.tasks.find_contribution'),
'schedule': crontab(minute=22, hour=22, day_of_week=6),
# Every week on Saturday at 22:22 UTC,
'enabled': False
},
# 'mef-harvester': {
# 'task': 'rero_ils.modules.apiharvester.tasks.harvest_records',
# 'schedule': timedelta(minutes=60),
Expand Down Expand Up @@ -2531,7 +2537,9 @@ def _(x):
RERO_ILS_UI_GIT_HASH = None

#: RERO_ILS MEF specific configurations.
RERO_ILS_MEF_URL = 'https://{host}/api/mef/'.format(host='mef.rero.ch')
# TODO: to be changed with new MEF version
# RERO_ILS_MEF_AGENTS_URL = 'https://mef.rero.ch/api/agents'
RERO_ILS_MEF_AGENTS_URL = 'https://mef.rero.ch/api'
RERO_ILS_MEF_RESULT_SIZE = 100

RERO_ILS_APP_HELP_PAGE = (
Expand Down
12 changes: 12 additions & 0 deletions rero_ils/modules/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@

from ..documents.api import Document, DocumentsIndexer, DocumentsSearch
from ..documents.dojson.contrib.marc21tojson import marc21
from ..documents.tasks import find_contribution as task_find_contribution
from ..documents.views import get_cover_art
from ..items.api import Item
from ..libraries.api import Library
Expand Down Expand Up @@ -1389,3 +1390,14 @@ def add_cover_urls(verbose):
url = get_cover_art(record=record, save_cover_url=True)
if verbose:
click.echo(f'{idx}:\tdocument: {pid}\t{url}')


@utils.command()
@click.option('-v', '--verbose', is_flag=True, default=False)
@with_appcontext
def find_contribution(verbose):
    """Run the find-contribution task and print its counters.

    :param verbose: forwarded to the task as its ``verbose`` argument.
    """
    click.secho('Find contribution.', fg='green')
    # The task is called directly (not queued) so its four counters can
    # be unpacked and reported on the command line.
    counters = task_find_contribution(verbose=verbose)
    found, exists, no_idref, no_mef = counters
    summary = (
        f'Found: {found} | Exists: {exists} | '
        f'No IdRef: {no_idref} | No MEF: {no_mef}'
    )
    click.echo(summary)
8 changes: 4 additions & 4 deletions rero_ils/modules/contributions/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,14 @@ def dumps_for_document(self):
@classmethod
def _get_mef_data_by_type(cls, pid, pid_type, verbose=False):
"""Request MEF REST API in JSON format."""
url = current_app.config.get('RERO_ILS_MEF_URL')
url = current_app.config.get('RERO_ILS_MEF_AGENTS_URL')
if pid_type == 'mef':
mef_url = f'{url}?q=pid:{pid}'
mef_url = f'{url}/mef/?q=pid:{pid}'
else:
if pid_type == 'viaf':
mef_url = f'{url}?q=viaf_pid:{pid}'
mef_url = f'{url}/mef/?q=viaf_pid:{pid}'
else:
mef_url = f'{url}?q={pid_type}.pid:{pid}'
mef_url = f'{url}/mef/?q={pid_type}.pid:{pid}'
request = requests.get(url=mef_url, params=dict(resolve=1, sources=1))
if request.status_code == requests_codes.ok:
try:
Expand Down
2 changes: 1 addition & 1 deletion rero_ils/modules/contributions/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def mef_proxy(path):
method=request.method,
url=request.url.replace(
request.base_url.replace(path, ''),
current_app.config.get('RERO_ILS_MEF_URL')
f'{current_app.config.get("RERO_ILS_MEF_AGENTS_URL")}/mef/'
),
headers={
key: value for (key, value) in request.headers if key != 'Host'
Expand Down
5 changes: 2 additions & 3 deletions rero_ils/modules/documents/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,13 @@

from flask import current_app
from invenio_circulation.search.api import search_by_pid
from invenio_search.api import RecordsSearch
from jsonschema.exceptions import ValidationError

from .models import DocumentIdentifier, DocumentMetadata
from .utils import edition_format_text, publication_statement_text, \
series_statement_format_text, title_format_text_head
from ..acq_order_lines.api import AcqOrderLinesSearch
from ..api import IlsRecord, IlsRecordsIndexer
from ..api import IlsRecord, IlsRecordsIndexer, IlsRecordsSearch
from ..fetchers import id_fetcher
from ..minters import id_minter
from ..operation_logs.extensions import OperationLogObserverExtension
Expand All @@ -50,7 +49,7 @@
document_id_fetcher = partial(id_fetcher, provider=DocumentProvider)


class DocumentsSearch(RecordsSearch):
class DocumentsSearch(IlsRecordsSearch):
"""DocumentsSearch."""

class Meta:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,20 @@
},
"preferred_name": {
"type": "text"
},
"identifiedBy": {
"type": "object",
"properties": {
"type": {
"type": "keyword"
},
"value": {
"type": "keyword"
},
"source": {
"type": "keyword"
}
}
}
}
},
Expand Down
128 changes: 128 additions & 0 deletions rero_ils/modules/documents/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Celery tasks for documents."""

from __future__ import absolute_import, print_function

import click
from celery import shared_task
# from celery.task.control import inspect
from flask import current_app

from .api import Document, DocumentsSearch
from ..contributions.api import Contribution


@shared_task(ignore_result=True)
def find_contribution(verbose=False):
    """Find contributions with identifiedBy and replace them with $refs.

    Scans all documents having a ``contribution.agent.identifiedBy``
    field. For each such contribution the matching local contribution
    record is looked up; if there is none, the MEF REST API is queried
    and a local contribution is created from the answer. When the
    resulting contribution has an ``idref`` entry, the agent of the
    document contribution is replaced by a ``$ref`` link and the
    document is updated.

    :param verbose: Verbose print.
    :returns: tuple with the number of distinct ``type/value``
        references for: contributions found online, contributions
        already existing locally, contributions without IdRef and
        contributions not found in MEF.
    """
    query = DocumentsSearch() \
        .filter('exists', field='contribution.agent.identifiedBy') \
        .source(['pid']) \
        .scan()
    # All counters are keyed by the '<type>/<value>' reference string.
    cont_found = {}
    cont_exists = {}
    cont_no_idref = {}
    cont_no_mef = {}
    mef_url = current_app.config.get('RERO_ILS_MEF_AGENTS_URL')
    for hit in query:
        doc = Document.get_record_by_id(hit.meta.id)
        new_contributions = []
        changed = False
        for contribution in doc.get('contribution', []):
            new_contributions.append(contribution)
            identified_by = contribution['agent'].get('identifiedBy', {})
            ref_type = identified_by.get('type', '').lower()
            ref_pid = identified_by.get('value')
            if not (ref_type and ref_pid):
                # nothing to resolve for this contribution
                continue
            ref = f'{ref_type}/{ref_pid}'
            # Try to get existing contribution
            cont = Contribution.get_contribution(ref_type, ref_pid)
            if cont:
                # contribution exists already
                cont_exists[ref] = cont_exists.get(ref, 0) + 1
            else:
                # contribution does not exist
                try:
                    # try to get the contribution online
                    data = Contribution._get_mef_data_by_type(
                        ref_pid, ref_type)
                    metadata = data['metadata']
                    if metadata.get('idref'):
                        found = cont_found.setdefault(
                            ref,
                            {'count': 0, 'mef': metadata.get('pid')}
                        )
                        found['count'] += 1
                        # create local contribution
                        metadata.pop('$schema', None)
                        cont = Contribution.create(
                            data=metadata, dbcommit=True, reindex=True)
                    else:
                        # online contribution has no IdRef
                        cont_no_idref[ref] = cont_no_idref.get(ref, 0) + 1
                except Exception:
                    # Best effort: any failure while fetching or creating
                    # the contribution is counted as "not found in MEF"
                    # so that one bad reference does not stop the run.
                    cont_no_mef[ref] = cont_no_mef.get(ref, 0) + 1
            if not cont:
                continue
            if cont.get('idref'):
                # change the contribution to linked contribution
                changed = True
                url = f'{mef_url}/idref/{cont["idref"]["pid"]}'
                # `contribution` is the same object that was appended to
                # new_contributions above, so this updates the new list.
                contribution['agent'] = {
                    '$ref': url,
                    'type': contribution['agent']['type']
                }
            else:
                # contribution has no IdRef
                cont_no_idref[ref] = cont_no_idref.get(ref, 0) + 1
        if changed:
            doc['contribution'] = new_contributions
            doc.update(data=doc, dbcommit=True, reindex=True)
    if verbose:
        if cont_found:
            click.secho('Found:', fg='green')
            for key, value in cont_found.items():
                click.echo(f'\t{key} MEF pid: {value["mef"]} '
                           f'count: {value["count"]}')
        for msg, data in {
            'Exist:': cont_exists,
            'No IdRef:': cont_no_idref,
            'No Mef:': cont_no_mef
        }.items():
            if data:
                click.secho(msg, fg='yellow')
                for key, value in data.items():
                    click.echo(f'\t{key} count: {value}')
    return (
        len(cont_found),
        len(cont_exists),
        len(cont_no_idref),
        len(cont_no_mef)
    )
2 changes: 2 additions & 0 deletions rero_ils/modules/documents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,8 @@ def create_contributions(contributions):
'parallel_access_point')
if parallel_access_point:
agent['parallel_access_point'] = parallel_access_point
if contribution['agent'].get('identifiedBy'):
agent['identifiedBy'] = contribution['agent']['identifiedBy']
contribution['agent'] = agent

calculated_contributions.append(contribution)
Expand Down
6 changes: 3 additions & 3 deletions rero_ils/modules/holdings/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
from elasticsearch_dsl import Q
from flask import current_app
from flask_babelex import gettext as _
from invenio_search.api import RecordsSearch
from jinja2 import Environment

from rero_ils.modules.items.models import ItemIssueStatus

from .models import HoldingIdentifier, HoldingMetadata, HoldingTypes
from ..api import IlsRecord, IlsRecordError, IlsRecordsIndexer
from ..api import IlsRecord, IlsRecordError, IlsRecordsIndexer, \
IlsRecordsSearch
from ..documents.api import Document
from ..errors import MissingRequiredParameterError, RegularReceiveNotAllowed
from ..fetchers import id_fetcher
Expand Down Expand Up @@ -66,7 +66,7 @@
JINJA_ENV.filters['format_date_filter'] = format_date_filter


class HoldingsSearch(RecordsSearch):
class HoldingsSearch(IlsRecordsSearch):
"""RecordsSearch for holdings."""

class Meta:
Expand Down
2 changes: 1 addition & 1 deletion scripts/setup
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ then
eval ${PREFIX} invenio reroils oaiharvester harvest -n ebooks -q -k
else
eval ${PREFIX} invenio reroils scheduler enable_tasks -n scheduler-timestamp -n bulk-indexer -n anonymize-loans -n claims-creation -n accounts -n clear_and_renew_subscriptions -n collect-stats -v
eval ${PREFIX} invenio reroils scheduler enable_tasks -n notification-creation -n notification-dispatch-availability -n notification-dispatch-recall -v
eval ${PREFIX} invenio reroils scheduler enable_tasks -n notification-creation -n notification-dispatch-availability -n notification-dispatch-recall -n find-contribution -v
eval ${PREFIX} invenio reroils scheduler enable_tasks -n cancel-expired-request -v
info_msg "For ebooks harvesting run:"
msg "\tinvenio reroils oaiharvester harvest -n ebooks -a max=100 -q"
Expand Down
4 changes: 2 additions & 2 deletions tests/api/documents/test_documents_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def test_documents_facets(
author='Nebehay, Christian Michael')
res = client.get(list_url, headers=rero_json_header)
data = get_json(res)
assert data['hits']['total']['value'] == 1
assert data['hits']['total']['value'] == 2

# 2. test deutsch language
list_url = url_for('invenio_records_rest.doc_list', view='global',
Expand All @@ -272,7 +272,7 @@ def test_documents_facets(
author='Nebehay, Christian Michael', lang='thl')
res = client.get(list_url, headers=rero_json_header)
data = get_json(res)
assert data['hits']['total']['value'] == 1
assert data['hits']['total']['value'] == 2


@mock.patch('invenio_records_rest.views.verify_record_permission',
Expand Down
52 changes: 52 additions & 0 deletions tests/api/documents/test_documents_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for the documents Celery tasks."""

from __future__ import absolute_import, print_function

from copy import deepcopy

import mock
from utils import mock_response

from rero_ils.modules.documents.api import Document, DocumentsSearch
from rero_ils.modules.documents.tasks import find_contribution


@mock.patch('requests.get')
def test_find_contribution(mock_contributions_mef_get, app, document_data,
                           contribution_person_response_data):
    """Check the counters returned by the find_contribution task."""
    # Before the document is created every counter is zero.
    assert find_contribution() == (0, 0, 0, 0)

    # Document indexed while `requests.get` is still the bare mock:
    # the reference is reported as not found in MEF.
    Document.create(data=document_data, dbcommit=True, reindex=True)
    DocumentsSearch.flush_and_refresh()
    assert find_contribution() == (0, 0, 0, 1)

    # MEF answer lacking the `idref` entry: reported as "no IdRef".
    response_without_idref = deepcopy(contribution_person_response_data)
    del response_without_idref['hits']['hits'][0]['metadata']['idref']
    mock_contributions_mef_get.return_value = mock_response(
        json_data=response_without_idref
    )
    assert find_contribution() == (0, 0, 1, 0)

    # Complete MEF answer: the contribution is reported as found.
    mock_contributions_mef_get.return_value = mock_response(
        json_data=contribution_person_response_data
    )
    assert find_contribution() == (1, 0, 0, 0)
10 changes: 7 additions & 3 deletions tests/data/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -1666,8 +1666,12 @@
"contribution": [
{
"agent": {
"preferred_name": "Vincent, Sophie",
"type": "bf:Person"
"preferred_name": "Nebehay, Christian Michael",
"type": "bf:Person",
"identifiedBy": {
"type": "RERO",
"value": "A003633163"
}
},
"role": [
"aut"
Expand Down Expand Up @@ -4129,4 +4133,4 @@
"home_phone": "+012024561414",
"keep_history": true
}
}
}
2 changes: 1 addition & 1 deletion tests/ui/documents/test_documents_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def test_work_access_point():

def test_contribution_format(db, document_data):
"""Test contribution format."""
result = 'Vincent, Sophie'
result = 'Nebehay, Christian Michael'
doc = Document.create(document_data, delete_pid=True)
assert contribution_format(doc.pid, 'en', 'global').startswith(result)

Expand Down
Loading

0 comments on commit e8d753b

Please sign in to comment.