Skip to content

Commit

Permalink
documents: complete the data conversion
Browse files Browse the repository at this point in the history
    * Implements transformation from Marc21 to JSON RERO ILS for:
        * frequency (L32).
        * bf:usageAndAccessPolicy (L74).
        * document relations (L28).
        * publication_place link from field 752 (L47).
    * closes rero#1617.
    * closes rero#1951.
    * closes rero#1987.
    * closes rero#1996.
Co-Authored-by: Gianni Pante <[email protected]>
  • Loading branch information
reropag committed Jun 13, 2021
1 parent d98129e commit f3ade41
Show file tree
Hide file tree
Showing 3 changed files with 647 additions and 60 deletions.
58 changes: 55 additions & 3 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,18 @@

"""Dojson utils."""

import os
import re
import sys
import traceback
from copy import deepcopy

import click
import requests
from dojson import Overdo, utils

from rero_ils.modules.utils import requests_retry_session

_UNIMARC_LANGUAGES_SCRIPTS = {
'ba': 'latn', # Latin
'ca': 'cyrl', # Cyrillic
Expand Down Expand Up @@ -286,6 +290,8 @@
'z': 'Not applicable'
}

# MARC tags for which get_contribution_link() attempts a MEF lookup; the
# list is compared against the first three characters of the field key.
_CONTRIBUTION_TAGS = ['100', '600', '610', '611', '630', '650', '651',
'655', '700', '710', '711']

# Splits a "(source)value" string: group 1 is the text between the first
# opening and the last closing parenthesis, group 2 is whatever follows.
re_identified = re.compile(r'\((.*)\)(.*)')

Expand Down Expand Up @@ -379,6 +385,35 @@ def remove_trailing_punctuation(
'',
data.rstrip()).rstrip()

def get_contribution_link(bibid, reroid, id, key):
    """Resolve the MEF URL of a contribution referenced by a MARC field.

    The ``id`` value is matched against ``re_identified``, i.e. it is
    expected to look like ``(source)value``. A lookup on the MEF server is
    only attempted when the source is ``idref`` and the tag (first three
    characters of ``key``) is listed in ``_CONTRIBUTION_TAGS``.

    :param bibid: Bib id from the record (only passed to error_print).
    :param reroid: RERO id from the record (only passed to error_print).
    :param id: $0 from the marc field.
    :param key: Tag from the marc field.
    :returns: MEF url pointing at the production host when the server
        answers with ``requests.codes.ok``, otherwise None.
    """
    # https://mef.test.rero.ch/api/mef/?q=rero.rero_pid:A012327677
    production_host = 'mef.rero.ch'
    # the host that is actually queried can be overridden by environment
    queried_host = os.environ.get('RERO_ILS_MEF_HOST', 'mef.rero.ch')
    api_root = f'https://{queried_host}/api/'

    found = re_identified.search(id)
    is_candidate = (
        found
        and len(found.groups()) == 2
        and key[:3] in _CONTRIBUTION_TAGS
    )
    if not is_candidate:
        error_print('ERROR GET MEF CONTRIBUTION:', bibid, reroid, key, id)
        return None

    source = found.group(1).lower()
    if source != 'idref':
        # other sources are silently ignored
        return None

    url = f'{api_root}{source}/{found.group(2)}'
    status_code = requests_retry_session().get(url).status_code
    if status_code == requests.codes.ok:
        # always expose the production host in the stored link
        return url.replace(queried_host, production_host)
    error_print('WARNING GET MEF CONTRIBUTION:',
                bibid, reroid, key, id, url, status_code)
    return None


def add_note(new_note, data):
"""Add a new note to the data avoiding duplicate notes.
Expand Down Expand Up @@ -913,6 +948,7 @@ class ReroIlsMarc21Overdo(ReroIlsOverdo):
has_field_490 = False
has_field_580 = False
content_media_carrier_type = None
links_from_752 = []

def __init__(self, bases=None, entry_point_group=None):
"""Reroilsmarc21overdo init."""
Expand Down Expand Up @@ -962,8 +998,11 @@ def do(self, blob, ignore_missing=True, exception_handlers=None):
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.original_date_from_008 = None
self.date_type_from_008 = ''
self.date = {'start_date': None}
self.serial_type = ''
self.is_top_level_record = False
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
Expand Down Expand Up @@ -995,9 +1034,9 @@ def do(self, blob, ignore_missing=True, exception_handlers=None):
# identify a top level record (has 019 $a Niveau supérieur)
regexp = re.compile(r'Niveau sup[eé]rieur', re.IGNORECASE)
fields_019 = self.get_fields(tag='019')
note = ''
notes_from_019_and_351 = []
for field_019 in fields_019:
note = ''
for subfield_a in self.get_subfields(field_019, 'a'):
note += ' | ' + subfield_a
if regexp.search(subfield_a):
Expand Down Expand Up @@ -1037,7 +1076,20 @@ def do(self, blob, ignore_missing=True, exception_handlers=None):
if description_conventions:
self.admin_meta_data['descriptionConventions'] = \
description_conventions
# check presence of specific fields

# build the list of links from field 752
self.links_from_752 = []
fields_752 = self.get_fields(tag='752')
for field_752 in fields_752:
subfields_d = self.get_subfields(field_752, 'd')
items = get_field_items(field_752['subfields'])

if subfields_d:
identifier = build_identifier(field_752['subfields'])
if identifier:
self.links_from_752.append(identifier)

# check presence of specific fields
self.has_field_490 = len(self.get_fields(tag='490')) > 0
self.has_field_580 = len(self.get_fields(tag='580')) > 0
result = super().do(
Expand Down Expand Up @@ -1107,7 +1159,7 @@ def init_lang_from(fields_041, code):
langs_from_041.append(lang_from_041)
return langs_from_041

self.lang_from_008 = ''
self.lang_from_008 = None
self.langs_from_041_a = []
self.langs_from_041_h = []
try:
Expand Down
188 changes: 137 additions & 51 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,35 @@

"""rero-ils MARC21 model definition."""

import os
import re

import requests
from dojson import utils
from dojson.utils import GroupableOrderedDict

from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
add_note, build_identifier, build_responsibility_data, \
build_string_from_subfields, error_print, \
extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
get_field_link_data, make_year, not_repetitive, re_identified, \
remove_trailing_punctuation
from rero_ils.modules.utils import requests_retry_session
extract_subtitle_and_parallel_titles_from_field_245_b, \
get_contribution_link, get_field_items, get_field_link_data, make_year, \
not_repetitive, remove_trailing_punctuation

# Name of the document property that receives the relation built from each
# MARC tag. All names are lowerCamelCase ('777' used to be 'IssuedWith',
# which produced a key inconsistent with every other relation).
_DOCUMENT_RELATION_PER_TAG = {
    '770': 'supplement',
    '772': 'supplementTo',
    '775': 'otherEdition',
    '776': 'otherPhysicalFormat',
    '777': 'issuedWith',
    '780': 'precededBy',
    '785': 'succeededBy',
    '787': 'relatedTo',
    '533': 'hasReproduction',
    '534': 'reproductionOf'
}

# Subfield codes handed to build_string_from_subfields() to build the label
# of a reproduction relation, per MARC tag.
_REPRODUCTION_SUBFIELDS_PER_TAG = {
    '533': 'abcdemn',
    '534': 'cep'
}

_ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
'a': 'rdami:1001',
Expand Down Expand Up @@ -211,41 +226,21 @@

# Case-insensitive matchers for "(IdRef)..." / "(RERO)..." references; group 1
# is the text following the parenthesised source.
# The flag is passed as an argument: an inline `(?i)` that is not at the very
# start of the pattern is deprecated and rejected by Python 3.11+.
_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?', re.IGNORECASE)
_RERO_REF_REGEX = re.compile(r'^\(RERO\)(.*)?', re.IGNORECASE)
_CONTRIBUTION_TAGS = ['100', '600', '610', '611', '630', '650', '651',
                      '655', '700', '710', '711']


marc21 = ReroIlsMarc21Overdo()


def get_contribution_link(bibid, reroid, id, key):
    """Resolve the MEF URL of a contribution referenced by a MARC field.

    The ``id`` value is matched against ``re_identified``, i.e. it is
    expected to look like ``(source)value``. A lookup on the MEF server is
    only attempted when the source is ``idref`` and the tag (first three
    characters of ``key``) is listed in ``_CONTRIBUTION_TAGS``.

    :param bibid: Bib id from the record (only passed to error_print).
    :param reroid: RERO id from the record (only passed to error_print).
    :param id: $0 from the marc field.
    :param key: Tag from the marc field.
    :returns: MEF url pointing at the production host when the server
        answers with ``requests.codes.ok``, otherwise None.
    """
    # https://mef.test.rero.ch/api/mef/?q=rero.rero_pid:A012327677
    production_host = 'mef.rero.ch'
    # the host that is actually queried can be overridden by environment
    queried_host = os.environ.get('RERO_ILS_MEF_HOST', 'mef.rero.ch')
    api_root = f'https://{queried_host}/api/'

    found = re_identified.search(id)
    is_candidate = (
        found
        and len(found.groups()) == 2
        and key[:3] in _CONTRIBUTION_TAGS
    )
    if not is_candidate:
        error_print('ERROR GET MEF CONTRIBUTION:', bibid, reroid, key, id)
        return None

    source = found.group(1).lower()
    if source != 'idref':
        # other sources are silently ignored
        return None

    url = f'{api_root}{source}/{found.group(2)}'
    status_code = requests_retry_session().get(url).status_code
    if status_code == requests.codes.ok:
        # always expose the production host in the stored link
        return url.replace(queried_host, production_host)
    error_print('WARNING GET MEF CONTRIBUTION:',
                bibid, reroid, key, id, url, status_code)
    return None
def build_place():
    """Assemble the main place entry used by provisionActivity.

    The values are read from the module-level ``marc21`` converter: its
    first canton, its country and the first link collected from field 752.

    :returns: dict holding any of ``canton``, ``country``, ``type`` and
        ``identifyBy``; empty when none of the source values is set.
    """
    result = {}
    cantons = marc21.cantons
    if cantons:
        result['canton'] = cantons[0]
    country = marc21.country
    if country:
        result['country'] = country
    # the type is only added once a canton or a country has been found
    if result:
        result['type'] = 'bf:Place'
    links = marc21.links_from_752
    if links:
        result['identifyBy'] = links[0]
    return result


@marc21.over('issuance', 'leader')
Expand Down Expand Up @@ -346,7 +341,26 @@ def marc21_to_language(self, key, value):
if fields_264:
error_print('WARNING INVALID 264', marc21.bib_id, marc21.rero_id,
fields_264)
self['provisionActivity'] = [{'type': 'bf:Publication'}]
places = []
publication = {
    'type': 'bf:Publication'
}
place = build_place()
if place:
    places.append(place)
# Parse the remaining links, skipping the first one (already used by
# build_place). The list lives on the `marc21` converter: the former bare
# `links_from_752[i]` was an undefined name and raised NameError.
for link in marc21.links_from_752[1:]:
    places.append({
        'country': 'und',
        'type': 'bf:Place',
        'identifyBy': link
    })

if places:
    publication['place'] = places
self['provisionActivity'] = [publication]

if (marc21.date_type_from_008 == 'q' or
marc21.date_type_from_008 == 'n'):
self['provisionActivity'][0][
Expand Down Expand Up @@ -635,6 +649,33 @@ def marc21_to_contribution(self, key, value):
}


@marc21.over('relation', '(770|772|775|776|777|780|785|787|533|534)..')
@utils.for_each_value
@utils.ignore_value
def marc21_to_specific_document_relation(self, key, value):
    """Get document relations from linking and reproduction fields.

    For 533/534 the relation is a ``label`` built from the subfields listed
    in ``_REPRODUCTION_SUBFIELDS_PER_TAG``. For the other tags the relation
    is a ``$ref`` to the document whose pid is given in subfield ``$w``
    (an optional leading ``REROILS:`` is dropped); nothing is produced when
    ``$w`` is empty.

    The relation is appended to the list stored on ``self`` under the
    property named by ``_DOCUMENT_RELATION_PER_TAG`` for the tag; the
    function itself returns nothing.

    :param key: Tag (with indicators) of the marc field.
    :param value: Content of the marc field.
    """
    tag = key[:3]
    relation = None
    if tag in ['533', '534']:
        label = build_string_from_subfields(
            value,
            _REPRODUCTION_SUBFIELDS_PER_TAG[tag]
        )
        relation = {'label': label}
    else:
        subfield_w = not_repetitive(marc21.bib_id, marc21.rero_id,
                                    key, value, 'w', default='').strip()
        if subfield_w:
            # re.sub relies on the module-level pattern cache, so the
            # pattern is not rebuilt for every field
            pid = re.sub(r'^REROILS:', '', subfield_w)
            ref = f'https://bib.rero.ch/api/documents/{pid}'
            relation = {'$ref': ref}
    if relation:
        relation_name = _DOCUMENT_RELATION_PER_TAG[tag]
        relation_list = self.get(relation_name, [])
        relation_list.append(relation)
        self[relation_name] = relation_list


@marc21.over('copyrightDate', '^264.4')
@utils.ignore_value
def marc21_to_copyright_date(self, key, value):
Expand Down Expand Up @@ -690,10 +731,12 @@ def marc21_to_edition_statement(self, key, value):
def marc21_to_provisionActivity(self, key, value):
"""Get publisher data.
publisher.name: 264 [$b repetitive] (without the , but keep the ;)
publisher.place: 264 [$a repetitive] (without the : but keep the ;)
publicationDate: 264 [$c repetitive] (but take only the first one)
"""

def build_statement(field_value, ind2):

def build_agent_data(code, label, index, link):
Expand Down Expand Up @@ -732,16 +775,6 @@ def build_agent_data(code, label, index, link):
index += 1
return statement

def build_place():
place = {}
if marc21.cantons:
place['canton'] = marc21.cantons[0]
if marc21.country:
place['country'] = marc21.country
if place:
place['type'] = 'bf:Place'
return place

# the function marc21_to_provisionActivity start here
ind2 = key[4]
type_per_ind2 = {
Expand All @@ -763,9 +796,21 @@ def build_place():
publication['endDate'] = marc21.date['end_date']
if 'note' in marc21.date:
publication['note'] = marc21.date['note']

places = []
place = build_place()
if place:
publication['place'] = [place]
places.append(place)
# parse the links, skipping the first (already used by build_place)
for i in range(1, len(marc21.links_from_752)):
place = {
'country': 'und',
'type': 'bf:Place',
'identifyBy': marc21.links_from_752[i]
}
places.append(place)
if places:
publication['place'] = places

publication['statement'] = build_statement(value, ind2)
if subfields_c:
Expand Down Expand Up @@ -859,6 +904,47 @@ def marc21_to_summary(self, key, value):
self['tableOfContents'] = table_of_contents_list


@marc21.over('usageAndAccessPolicy', '^(506|540)..')
@utils.ignore_value
def marc21_to_usage_and_access_policy_from_field_506_540(self, key, value):
    """Get usageAndAccessPolicy from fields: 506, 540.

    The stripped subfield ``$a`` becomes the label of a new
    ``bf:UsageAndAccessPolicy`` entry, appended to the policies already
    stored on ``self``.

    :param key: Tag (with indicators) of the marc field.
    :param value: Content of the marc field.
    :returns: the list of policies, or None when ``$a`` is empty.
    """
    label = not_repetitive(marc21.bib_id, marc21.rero_id,
                           key, value, 'a', default='').strip()
    if not label:
        # nothing to record for this field
        return None
    policies = self.get('usageAndAccessPolicy', [])
    policies.append({
        'type': 'bf:UsageAndAccessPolicy',
        'label': label
    })
    return policies or None


@marc21.over('frequency', '^(310|321)..')
@utils.ignore_value
def marc21_to_frequency_field_310_321(self, key, value):
    """Get frequency from fields: 310, 321.

    Subfield ``$a`` (``missing_label`` when absent) gives the label, with
    trailing commas removed; a non-empty subfield ``$b`` gives the date.
    The entry is appended to the frequencies already stored on ``self``.

    :param key: Tag (with indicators) of the marc field.
    :param value: Content of the marc field.
    :returns: the list of frequency entries.
    """
    def read_subfield(code, fallback):
        """Return the stripped, non repetitive subfield `code`."""
        return not_repetitive(
            marc21.bib_id, marc21.rero_id,
            key, value, code, default=fallback).strip()

    raw_label = read_subfield('a', 'missing_label')
    date = read_subfield('b', '')

    entry = {
        'label': remove_trailing_punctuation(
            data=raw_label,
            punctuation=',',
            spaced_punctuation=','
        )
    }
    if date:
        entry['date'] = date
    frequencies = self.get('frequency', [])
    frequencies.append(entry)
    return frequencies or None


@marc21.over('dissertation', '^502..')
@utils.for_each_value
@utils.ignore_value
Expand Down Expand Up @@ -1308,7 +1394,7 @@ def marc21_to_identifiedBy_from_field_930(self, key, value):
return identifiedBy or None


@marc21.over('note', '^(500|510|530|545|580)..')
@marc21.over('note', '^(500|510|530|545|555|580)..')
@utils.for_each_value
@utils.ignore_value
def marc21_to_notes_and_original_title(self, key, value):
Expand Down
Loading

0 comments on commit f3ade41

Please sign in to comment.