Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dojson : remove punctuation for work_access_point #3144

Merged
merged 1 commit into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions data/documents_big.json
Original file line number Diff line number Diff line change
Expand Up @@ -89526,7 +89526,7 @@
"type": "bf:Person",
"preferred_name": "Cook, Glen."
},
"title": "L'\u009ceau dort"
"title": "L'eau dort"
},
{
"agent": {
Expand Down Expand Up @@ -98548,7 +98548,7 @@
"type": "bf:Person",
"preferred_name": "Sartori, Luigi."
},
"title": "Il \u009cmistero della salvezza"
"title": "Il mistero della salvezza"
},
{
"agent": {
Expand All @@ -98559,7 +98559,7 @@
"type": "IdRef"
}
},
"title": "La \u009cstoria della salvezza nel cristianesimo evangelico"
"title": "La storia della salvezza nel cristianesimo evangelico"
},
{
"agent": {
Expand Down
13 changes: 11 additions & 2 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1766,11 +1766,20 @@ def update_part(self, value_data, subfield_code, subfield_data):
:param subfield_data: part number or name depending on `subfield_code`
:type subfield_data: str
"""
def remove_last_dot(value):
    """Strip the trailing dot when it is the only dot in the value.

    A value holding several dots (for instance an abbreviation such as
    ``A.T. et N.T.``) is returned untouched, and so is a value whose
    single dot is not in last position.
    """
    has_single_dot = value.count('.') == 1
    return value.rstrip('.') if has_single_dot else value

value_data = remove_last_dot(value_data)
if self.part_number_waiting_name:
if subfield_code == self.part_name_code:
self.part_list.append(
dict(partNumber=self.part_number_waiting_name,
partName=value_data)
dict(
partNumber=self.part_number_waiting_name,
partName=value_data
)
)
self.part_number_waiting_name = {}
else:
Expand Down
88 changes: 61 additions & 27 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,16 +519,14 @@ def build_agent(marc21, key, value):
if value.get('a'):
name = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a')
agent_data['preferred_name'] = remove_trailing_punctuation(
name)
agent_data['preferred_name'] = remove_trailing_punctuation(name)
# 100|700|240 Person
if key[:3] in ['100', '700']:
agent_data['type'] = 'bf:Person'
if value.get('a'):
name = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a')
agent_data['preferred_name'] = remove_trailing_punctuation(
name) # name.rstrip('.')
agent_data['preferred_name'] = remove_trailing_punctuation(name)
if value.get('b'):
numeration = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'b')
Expand Down Expand Up @@ -564,10 +562,9 @@ def build_agent(marc21, key, value):
agent_data['conference'] = key[:3] == '711'
if value.get('b'):
subordinate_units = [
subordinate_unit.rstrip('.')
remove_trailing_punctuation(subordinate_unit, ',.')
for subordinate_unit in utils.force_list(value.get('b'))
]

agent_data['subordinate_unit'] = subordinate_units
if value.get('n'):
numbering = not_repetitive(
Expand Down Expand Up @@ -1635,55 +1632,83 @@ def do_work_access_point(marc21, key, value):
title_tag = 't'
agent['type'] = 'bf:Person'
if value.get('a'):
agent['preferred_name'] = remove_trailing_punctuation(
not_repetitive(marc21.bib_id, marc21.bib_id, key, value, 'a'))
preferred_name = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a')
preferred_name = remove_trailing_punctuation(not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a',
',.'
)).rstrip('.')
agent['preferred_name'] = preferred_name
if value.get('b'):
agent['numeration'] = remove_trailing_punctuation(
not_repetitive(marc21.bib_id, marc21.bib_id, key, value, 'b'))
if dates := not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'd'):
split_dates = dates.split('-')
if date_of_birth := split_dates[0].strip():
if date_of_birth := split_dates[0].strip().rstrip('.'):
agent['date_of_birth'] = date_of_birth
with contextlib.suppress(Exception):
if date_of_death := split_dates[1].strip():
if date_of_death := split_dates[1].strip().rstrip('.'):
agent['date_of_death'] = date_of_death
if value.get('c'):
agent['qualifier'] = remove_trailing_punctuation(
not_repetitive(marc21.bib_id, marc21.bib_id, key, value, 'c'))
not_repetitive(marc21.bib_id, marc21.bib_id, key, value, 'c')
).rstrip('.')
elif tag == '710':
title_tag = 't'
agent['type'] = 'bf:Organisation'
agent['conference'] = False
if value.get('a'):
agent['preferred_name'] = not_repetitive(
preferred_name = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a')
preferred_name = remove_trailing_punctuation(not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'a',
',.'
)).rstrip('.')
agent['preferred_name'] = preferred_name
if value.get('b'):
agent['subordinate_unit'] = list(utils.force_list(value.get('b')))
for subordinate_unit in list(utils.force_list(value.get('b'))):
subordinate_unit = remove_trailing_punctuation(
subordinate_unit).rstrip('.')
agent.setdefault('subordinate_unit', [])
agent['subordinate_unit'].append(subordinate_unit)

if agent:
work_access_point['agent'] = agent
if value.get(title_tag):
work_access_point['title'] = not_repetitive(
title = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, title_tag)
work_access_point['title'] = remove_trailing_punctuation(
title, ',.').replace('\u009c', '')
if value.get('f'):
work_access_point['date_of_work'] = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'f')
if value.get('g'):
work_access_point['miscellaneous_information'] = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'g')
work_access_point['miscellaneous_information'] = \
remove_trailing_punctuation(not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'g'), ',.')
if value.get('l'):
language = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'l')
marc21.bib_id, marc21.bib_id, key, value, 'l'
).lstrip('(').rstrip('.').rstrip(')')
lang = language
if lang not in _LANGUAGES:
# try to get alpha3 language:
if iso_language := find(language):
if language not in _LANGUAGES:
if len(language.split('-')) > 1 or language == 'mehrsprachig':
lang = 'mul'
elif iso_language := find(language):
lang = iso_language.get('iso639_2_b')
if lang in _LANGUAGES:
work_access_point['language'] = lang
else:
if lang == 'mul' or lang not in _LANGUAGES:
error_print('WARNING WORK ACCESS POINT LANGUAGE:', marc21.bib_id,
marc21.rero_id, language)
if miscellaneous_information := work_access_point.get(
'miscellaneous_information'):
work_access_point['miscellaneous_information'] = \
f'{miscellaneous_information} | language: {language}'
else:
work_access_point['miscellaneous_information'] = \
f'language: {language}'
part_list = TitlePartList(part_number_code='n', part_name_code='p')
items = get_field_items(value)
index = 1
Expand All @@ -1693,19 +1718,26 @@ def do_work_access_point(marc21, key, value):
if blob_key != '__order__':
index += 1
if the_part_list := part_list.get_part_list():
for part in the_part_list:
if part_name := part.get('partName'):
part['partName'] = remove_trailing_punctuation(part_name)
work_access_point['part'] = the_part_list
if value.get('k'):
work_access_point['form_subdivision'] = list(
utils.force_list(value.get('k')))
for form_subdivision in list(utils.force_list(value.get('k'))):
work_access_point.setdefault('form_subdivision', [])
work_access_point['form_subdivision'].append(
remove_trailing_punctuation(form_subdivision, ',.'))
if value.get('m'):
work_access_point['medium_of_performance_for_music'] = list(
utils.force_list(value.get('m')))
if value.get('o'):
work_access_point['arranged_statement_for_music'] = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'o')
if value.get('r'):
work_access_point['key_for_music'] = not_repetitive(
marc21.bib_id, marc21.bib_id, key, value, 'r')
work_access_point['key_for_music'] = remove_trailing_punctuation(
not_repetitive(marc21.bib_id, marc21.bib_id, key, value, 'r'),
',.'
)
if identifier := build_identifier(value):
agent['identifiedBy'] = identifier

Expand All @@ -1729,7 +1761,9 @@ def do_work_access_point_240(marc21, key, value):
part_selection = {'n', 'p'}
for blob_key, blob_value in get_field_items(value):
if blob_key in {'a'}:
work_access_points['title'] = blob_value
title = remove_trailing_punctuation(
blob_value.replace('\u009c', ''))
work_access_points['title'] = title

if blob_key in part_selection:
part_list.update_part(blob_value, blob_key, blob_value)
Expand Down Expand Up @@ -1901,7 +1935,7 @@ def format_date_b(date):
elif date[0] == 'd':
date = f'+{date[1:]}'
else:
date = f'-{date}'
date = f'+{date}'
date_str = date[0]
year = date[1:5]
if test_min_max(year, 0, 9999):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/documents/test_documents_dojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -4164,7 +4164,7 @@ def test_marc21_to_part_of_without_link():
]
assert data.get('work_access_point') == [{
'agent': {
'preferred_name': 'Jacq, Christian.',
'preferred_name': 'Jacq, Christian',
'type': 'bf:Person'
},
'title': 'Ramsès'
Expand Down Expand Up @@ -4326,13 +4326,13 @@ def test_marc21_to_part_of_with_multiple_800():
]
assert data.get('work_access_point') == [{
'agent': {
'preferred_name': 'Mirallés, Ana.',
'preferred_name': 'Mirallés, Ana',
'type': 'bf:Person'
},
'title': 'A la recherche de la Licorne'
}, {
'agent': {
'preferred_name': 'Ruiz, Emilio.',
'preferred_name': 'Ruiz, Emilio',
'type': 'bf:Person'
},
'title': 'A la recherche de la Licorne'
Expand Down
7 changes: 4 additions & 3 deletions tests/unit/documents/test_documents_dojson_slsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ def test_marc21_to_contribution(mock_get):
'language': 'fre',
'title': 'No morirás'
}, {
'miscellaneous_information': 'language: Coréen',
'part': [{
'partName': 'A.T. et N.T. :',
'partNumber': '000.'
'partName': 'A.T. et N.T.',
'partNumber': '000'
}],
'title': 'Bible.'
'title': 'Bible'
}]

marc21xml = """
Expand Down