Skip to content

Commit

Permalink
Adding subject type mapping condition and an initial implementation o…
Browse files Browse the repository at this point in the history
…f bib/instance mapping for data-import based migration flow.
  • Loading branch information
bltravis committed Dec 17, 2024
1 parent bc7ce1a commit 4e64c27
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 40 deletions.
7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "folio_migration_tools"
version = "1.8.18"
version = "1.9.0a1"
description = "A tool allowing you to migrate data from legacy ILS:s (Library systems) into FOLIO LSP"
authors = ["Theodor Tolstoy <[email protected]>", "Lisa Sjögren", "Brooks Travis", "Jeremy Nelson", "Clinton Bradford"]
license = "MIT"
Expand Down Expand Up @@ -33,17 +33,16 @@ profile = "black"

[tool.poetry.dependencies]
python = "^3.9"
folioclient = "^0.61.1"
folioclient = "^0.61.2"
pyhumps = "^3.7.3"
defusedxml = "^0.7.1"
python-dateutil = "^2.8.2"
folio-uuid = "^0.2.8"
pymarc = "^5.2.1"
pymarc = "^5.2.3"
pydantic = "^1.10.2"
argparse-prompt = "^0.0.5"
deepdiff = "^6.2.3"
pyaml = "^21.10.1"
httpx = "^0.27.2"
python-i18n = "^0.3.9"

[tool.poetry.group.dev.dependencies]
Expand Down
3 changes: 3 additions & 0 deletions src/folio_migration_tools/folder_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ def setup_migration_file_structure(self, source_file_type: str = ""):
self.srs_records_path = (
self.results_folder / f"folio_srs_{object_type_string}{self.file_template}.json"
)
self.data_import_marc_path = (
self.results_folder / f"folio_marc_{object_type_string}{self.file_template}.mrc"
)
self.organizations_id_map_path = (
self.results_folder / f"{str(FOLIONamespaces.organizations.name).lower()}_id_map.json"
)
Expand Down
21 changes: 13 additions & 8 deletions src/folio_migration_tools/marc_rules_transformation/conditions.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def setup_reference_data_for_bibs(self):
logging.info("%s\tcontributor_types", len(self.folio.contributor_types))
logging.info("%s\talt_title_types", len(self.folio.alt_title_types))
logging.info("%s\tidentifier_types", len(self.folio.identifier_types))
logging.info("%s\tsubject_types", len(self.folio.subject_types))
# Raise for empty settings
if not self.folio.contributor_types:
raise TransformationProcessError("", "No contributor_types in FOLIO")
Expand All @@ -69,6 +70,8 @@ def setup_reference_data_for_bibs(self):
raise TransformationProcessError("", "No identifier_types in FOLIO")
if not self.folio.alt_title_types:
raise TransformationProcessError("", "No alt_title_types in FOLIO")
if not self.folio.subject_types:
raise TransformationProcessError("", "No subject_types in FOLIO")

# Set defaults
logging.info("Setting defaults")
Expand Down Expand Up @@ -851,12 +854,14 @@ def condition_set_note_staff_only_via_indicator(
return "false"

def condition_set_subject_type_id(self, legacy_id, value, parameter, marc_field: field.Field):
    """Resolve the FOLIO subject type UUID named in the mapping rule's parameters.

    Args:
        legacy_id: Legacy record identifier, used in error reporting.
        value: The mapped MARC value (unused; kept for the condition signature).
        parameter: Mapping-rule parameters; ``parameter["name"]`` is the
            subject type name to look up in FOLIO's subject types.
        marc_field (field.Field): The MARC field being mapped; included in
            the error message for traceability.

    Returns:
        str: The UUID of the matching subject type.

    Raises:
        TransformationProcessError: If no subject type matches
            ``parameter["name"]`` (or the lookup fails for any reason).
    """
    try:
        subject_type = self.get_ref_data_tuple_by_name(
            self.folio.subject_types, "subject_types", parameter["name"]
        )
        self.mapper.migration_report.add("MappedSubjectTypes", subject_type[1])
        return subject_type[0]
    except Exception as ee:
        # Chain the original cause so lookup failures stay diagnosable.
        raise TransformationProcessError(
            legacy_id,
            f"Subject type not found for {parameter['name']} {marc_field}",
        ) from ee
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
import logging
import os
import sys
import time
import traceback
from typing import List

import i18n
from folio_uuid.folio_namespaces import FOLIONamespaces
from pymarc import Field
from pymarc import Record
from pymarc import Subfield
from pymarc import Field, Record, Subfield

from folio_migration_tools.custom_exceptions import TransformationProcessError
from folio_migration_tools.custom_exceptions import TransformationRecordFailedError
from folio_migration_tools.custom_exceptions import (
TransformationProcessError,
TransformationRecordFailedError,
)
from folio_migration_tools.folder_structure import FolderStructure
from folio_migration_tools.helper import Helper
from folio_migration_tools.library_configuration import FileDefinition
from folio_migration_tools.library_configuration import HridHandling
from folio_migration_tools.library_configuration import FileDefinition, HridHandling
from folio_migration_tools.marc_rules_transformation.rules_mapper_base import (
RulesMapperBase,
)
Expand All @@ -32,6 +32,8 @@ def __init__(
self.created_objects_file = created_objects_file
if mapper.task_configuration.create_source_records:
self.srs_records_file = open(self.folder_structure.srs_records_path, "w+")
if mapper.task_configuration.data_import_marc:
self.data_import_marc_file = open(self.folder_structure.data_import_marc_path, "wb+")
self.unique_001s: set = set()
self.failed_records_count: int = 0
self.records_count: int = 0
Expand Down Expand Up @@ -85,6 +87,12 @@ def process_record(self, idx: int, marc_record: Record, file_def: FileDefinition
legacy_ids,
self.object_type,
)
if self.mapper.task_configuration.data_import_marc:
self.save_marc_record(
marc_record,
folio_rec,
self.object_type
)
Helper.write_to_file(self.created_objects_file, folio_rec)
self.mapper.migration_report.add_general_statistics(
i18n.t("Inventory records written to disk")
Expand Down Expand Up @@ -121,6 +129,19 @@ def process_record(self, idx: int, marc_record: Record, file_def: FileDefinition
):
self.mapper.remove_from_id_map(folio_rec.get("formerIds", []))

def save_marc_record(
    self, marc_record: Record, folio_rec: dict, object_type: FOLIONamespaces
):
    """Write the transformed record to the data-import MARC file.

    Delegates serialization to the mapper, which writes binary MARC
    (including the FOLIO record id) to ``self.data_import_marc_file``.
    """
    self.mapper.save_data_import_marc_record(
        self.data_import_marc_file, object_type, marc_record, folio_rec
    )

def save_srs_record(
self,
marc_record: Record,
Expand Down Expand Up @@ -246,7 +267,15 @@ def wrap_up(self):
self.mapper.mapped_legacy_fields,
)
if self.mapper.task_configuration.create_source_records:
    # BUG FIX: file.seek(0) returns the new position (0), which is always
    # falsy, so "if not f.seek(0)" deleted the SRS file unconditionally.
    # Rewind, then read one byte to test for emptiness. Close before
    # os.remove so the delete also works on platforms (Windows) that
    # refuse to remove an open file.
    self.srs_records_file.seek(0)
    srs_is_empty = not self.srs_records_file.read(1)
    self.srs_records_file.close()
    if srs_is_empty:
        os.remove(self.srs_records_file.name)
if self.mapper.task_configuration.data_import_marc:
    self.data_import_marc_file.seek(0)
    marc_is_empty = not self.data_import_marc_file.read(1)
    self.data_import_marc_file.close()
    if marc_is_empty:
        os.remove(self.data_import_marc_file.name)
self.mapper.wrap_up()

logging.info("Transformation report written to %s", report_file.name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,42 @@ def remove_repeated_subfields(marc_field: Field):
subfields=new_subfields,
)

@staticmethod
def save_data_import_marc_record(
    data_import_marc_file,
    record_type: FOLIONamespaces,
    marc_record: Record,
    folio_record,
):
    """Saves the source marc_record to a file to be loaded via Data Import.

    Adds a 999 ff $i field carrying the FOLIO record's UUID so Data
    Import can match the incoming MARC record to the already-created
    FOLIO record, then appends the record to the file as binary MARC.

    Args:
        data_import_marc_file: Open binary (wb) file handle to append to.
        record_type (FOLIONamespaces): FOLIO record type (currently unused
            by this method).
        marc_record (Record): The source MARC record; mutated in place
            (999 field added, leader position 9 set).
        folio_record (dict): The transformed FOLIO record; its "id" value
            is written to 999 $i.
    """
    marc_record.add_ordered_field(
        Field(
            tag="999",
            indicators=["f", "f"],
            subfields=[
                Subfield(code="i", value=folio_record["id"]),
            ],
        )
    )
    # Since they all should be UTF encoded, make the leader align.
    # NOTE(review): best-effort — older pymarc leaders are plain (immutable)
    # str, where item assignment raises; failure is logged, not fatal.
    try:
        marc_record.leader[9] = "a"
    except Exception as ee:
        logging.exception(
            "Something is wrong with the marc record's leader: %s, %s", marc_record.leader, ee
        )
    data_import_marc_file.write(marc_record.as_marc())

@staticmethod
def save_source_record(
srs_records_file,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ def __init__(
self.instance_relationships: dict = {}
self.instance_relationship_types: dict = {}
self.other_mode_of_issuance_id = get_unspecified_mode_of_issuance(self.folio_client)

# Data-import based loading supersedes SRS creation: when a MARC file
# for Data Import is requested, no SRS records are written by this run.
self.create_source_records = (
    self.task_configuration.create_source_records
    and not self.task_configuration.data_import_marc
)
self.data_import_marc = self.task_configuration.data_import_marc
self.start = time.time()

def perform_initial_preparation(self, marc_record: pymarc.Record, legacy_ids):
Expand All @@ -70,12 +73,13 @@ def perform_initial_preparation(self, marc_record: pymarc.Record, legacy_ids):
str(legacy_ids[-1]),
)
)
self.hrid_handler.handle_hrid(
FOLIONamespaces.instances,
folio_instance,
marc_record,
legacy_ids,
)
if self.create_source_records:
self.hrid_handler.handle_hrid(
FOLIONamespaces.instances,
folio_instance,
marc_record,
legacy_ids,
)
self.handle_leader_05(marc_record, legacy_ids)
if self.task_configuration.add_administrative_notes_with_legacy_ids:
for legacy_id in legacy_ids:
Expand Down Expand Up @@ -113,15 +117,18 @@ def parse_record(
ignored_subsequent_fields: set = set()
bad_tags = set(self.task_configuration.tags_to_delete) # "907"
folio_instance = self.perform_initial_preparation(marc_record, legacy_ids)
for marc_field in marc_record:
self.report_marc_stats(marc_field, bad_tags, legacy_ids, ignored_subsequent_fields)
if marc_field.tag not in ignored_subsequent_fields:
self.process_marc_field(
folio_instance,
marc_field,
ignored_subsequent_fields,
legacy_ids,
)
if self.data_import_marc:
self.simple_bib_map(marc_record, folio_instance, ignored_subsequent_fields, legacy_ids)
else:
for marc_field in marc_record:
self.report_marc_stats(marc_field, bad_tags, legacy_ids, ignored_subsequent_fields)
if marc_field.tag not in ignored_subsequent_fields:
self.process_marc_field(
folio_instance,
marc_field,
ignored_subsequent_fields,
legacy_ids,
)

self.perform_additional_parsing(folio_instance, marc_record, legacy_ids, file_def)
clean_folio_instance = self.validate_required_properties(
Expand All @@ -132,6 +139,21 @@ def parse_record(
self.report_folio_mapping(clean_folio_instance, self.schema)
return [clean_folio_instance]

def simple_bib_map(
    self,
    folio_instance: dict,
    marc_record: Record,
    ignored_subsequent_fields: set,
    legacy_ids: List[str],
):
    """Apply a minimal MARC-to-instance mapping (field 245 only).

    Creates a skeletal FOLIO Instance to be used with a Data Import based
    MARC loading flow, rather than creating SRS records during
    transformation.

    Args:
        folio_instance (dict): The FOLIO instance being built; mutated in place.
        marc_record (Record): The source MARC record; only field 245 is mapped.
        ignored_subsequent_fields (set): Tags to skip on repeat occurrences.
        legacy_ids (List[str]): Legacy identifiers, for error reporting.
    """
    # NOTE(review): one caller appears to pass (marc_record, folio_instance, ...)
    # in the opposite order — confirm all call sites agree with this signature.
    # Also assumes a 245 field is present; marc_record["245"] is None otherwise.
    self.process_marc_field(
        folio_instance, marc_record["245"], ignored_subsequent_fields, legacy_ids
    )

def perform_additional_parsing(
self,
folio_instance: dict,
Expand Down Expand Up @@ -220,8 +242,9 @@ def handle_holdings(self, marc_record: Record):

def wrap_up(self):
    """Finalize the bib mapping run.

    HRID settings are pushed back to FOLIO only when this run created
    source records; a data-import based flow leaves them untouched.
    """
    logging.info("Mapper wrapping up")
    # Flattened from two nested ifs: both conditions must hold.
    if self.create_source_records and self.task_configuration.update_hrid_settings:
        self.hrid_handler.store_hrid_settings()

def get_instance_type_id(self, marc_record, legacy_id):
return_id = ""
Expand Down
11 changes: 11 additions & 0 deletions src/folio_migration_tools/migration_tasks/bibs_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@ class TaskConfiguration(AbstractTaskConfiguration):
),
),
] = True
# Opt-in flag for the Data Import based loading flow (binary MARC output).
data_import_marc: Annotated[
    bool,
    Field(
        title="Generate a MARC file for data import overlay of instances",
        description=(
            # Trailing spaces are required: adjacent literals concatenate
            # with no separator ("that canbe", "a fileof" without them).
            "If set to true, the process will generate a file of binary MARC records that can "
            "be imported into FOLIO using the Data Import APIs. If set to false, only a file "
            "of FOLIO instance records (and optional SRS records) will be generated."
        ),
    ),
] = False
parse_cataloged_date: Annotated[
bool,
Field(
Expand Down
38 changes: 35 additions & 3 deletions tests/test_conditions.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from unittest.mock import Mock
from operator import le
from unittest.mock import Mock, create_autospec, patch

from folioclient import FolioClient
from pymarc import Field, Indicators, Subfield

from folio_migration_tools.marc_rules_transformation.conditions import Conditions
from folio_migration_tools.marc_rules_transformation.rules_mapper_bibs import (
BibsRulesMapper,
)
from folio_migration_tools.migration_report import MigrationReport
from folioclient import FolioClient
from pymarc import Field, Indicators, Subfield
from tests.test_rules_mapper_base import folio_client


def test_condition_trim_period():
Expand Down Expand Up @@ -110,3 +113,32 @@ def test_condition_set_note_staff_only_via_indicator():
mock, legacy_id, "value", {}, marc_field
)
assert res_false == "false"

def test_condition_set_subject_type_id():
    """condition_set_subject_type_id resolves a subject type name to its UUID."""
    mock = create_autospec(Conditions)
    parameter = {"name": "Topical term"}
    mock.mapper = Mock(spec=BibsRulesMapper)
    mock.mapper.migration_report = Mock(spec=MigrationReport)
    mock.folio = Mock(spec=FolioClient)
    # FOLIO's default subject types, as (id, name) pairs for readability.
    subject_type_rows = [
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b1", "Personal name"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b2", "Corporate name"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b3", "Meeting name"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b4", "Uniform title"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b5", "Named event"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b6", "Chronological term"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b7", "Topical term"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b8", "Geographic name"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff5b9", "Uncontrolled"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff510", "Faceted topical terms"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff511", "Genre/form"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff512", "Occupation"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff513", "Function"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff514", "Curriculum objective"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff515", "Hierarchical place name"),
        ("d6488f88-1e74-40ce-81b5-b19a928ff516", "Type of entity unspecified"),
    ]
    mock.folio.subject_types = [
        {"id": type_id, "name": name, "source": "folio"}
        for type_id, name in subject_type_rows
    ]
    mock.ref_data_dicts = {"subject_types": mock.folio.subject_types}
    legacy_id = "legacy_id"
    marc_field = Field(
        tag="650",
        indicators=["0", "0"],
        subfields=[Subfield(code="a", value="Subject 1")],
    )
    topical = ("d6488f88-1e74-40ce-81b5-b19a928ff5b7", "Topical term")

    with patch.object(mock, "get_ref_data_tuple_by_name", return_value=topical):
        res = Conditions.condition_set_subject_type_id(mock, legacy_id, "", parameter, marc_field)
        assert res == topical[0]

    with patch.object(mock, "get_ref_data_tuple", return_value=topical):
        res = Conditions.get_ref_data_tuple_by_name(
            mock, mock.folio.subject_types, "subject_types", parameter["name"]
        )
        assert res == topical

    res = Conditions.get_ref_data_tuple(
        mock, mock.folio.subject_types, "subject_types", "Topical term", "name"
    )
    assert res == topical
16 changes: 16 additions & 0 deletions tests/test_rules_mapper_bibs.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,3 +721,19 @@ def test_should_add_notes_550_556_to_notes_list_2(mapper):
"Disaster recovery : a model plan for libraries and information centers. 0959328971"
in notes
)

def test_simple_bib_map(mapper):
    """simple_bib_map should populate the instance title from MARC 245 $a $b."""
    instance: dict = {}
    record = pymarc.Record()
    field_245 = pymarc.Field(
        tag="245",
        indicators=["0", "0"],
        subfields=[
            pymarc.Subfield("a", "Modern Electrosynthetic Methods in Organic Chemistry /"),
            pymarc.Subfield("b", "Steen Hyldgaard Christensen, Christelle Didier, Andrew Jamison, Martin Meganck, Carl Mitcham, Byron Newberry, editors."),
        ],
    )
    record.add_field(field_245)

    mapper.simple_bib_map(instance, record, set(), ["legacy_id"])

    expected_title = "Modern Electrosynthetic Methods in Organic Chemistry / Steen Hyldgaard Christensen, Christelle Didier, Andrew Jamison, Martin Meganck, Carl Mitcham, Byron Newberry, editors."
    assert instance["title"] == expected_title

0 comments on commit 4e64c27

Please sign in to comment.