Fix restructuring DIP for CONTENT dm upload #1858

Merged 2 commits on Oct 10, 2023
16 changes: 7 additions & 9 deletions hack/README.md
@@ -2,8 +2,6 @@
 
 - [Audience](#audience)
 - [Requirements](#requirements)
-  - [Docker and Linux](#docker-and-linux)
-  - [Docker and Mac](#docker-and-mac)
 - [Elasticsearch container](#elasticsearch-container)
 - [Installation](#installation)
 - [Web UIs](#web-uis)
@@ -28,16 +26,15 @@
 
 ## Audience
 
-This Archivematica environment is based on Docker Compose [V3][audience-compose-v3]
-and it is specifically **designed for developers**. Compose can be used in a
-production environment but that is beyond the scope of this recipe. Please read
+This Archivematica environment is based on Docker Compose and it is
+specifically **designed for developers**. Compose can be used in a production
+environment but that is beyond the scope of this recipe. Please read
 the [documentation][audience-compose-reference].
 
 Artefactual developers use Docker Compose on Linux heavily so it's important
 that you're familiar with it, and some choices in the configuration of this
 environment break in other operative systems.
 
-[audience-compose-v3]: https://docs.docker.com/compose/compose-file/compose-file-v3/
 [audience-compose-reference]: https://docs.docker.com/compose/reference/overview/
 
 ## Requirements
@@ -63,9 +60,10 @@ am-mysql-1 551.9MiB / 7.763GiB
 am-clamavd-1 570MiB / 7.763GiB
 ```
 
-Software dependencies: Docker Engine, Docker Compose V3, git and make. Please
-use a version of Docker Engine greater than 23.0 which includes Buildkit as
-the default builder with support for multi-stage builds.
+Software dependencies: Docker Engine, Docker Compose, git and make. Please use
+a version of Docker Engine greater than 23.0 which includes Buildkit as the
+default builder with support for multi-stage builds and a version of Docker
+Compose greater than 2.17 which supports restarts of dependent services.
 
 It is beyond the scope of this document to explain how these dependencies are
 installed in your computer.
src/MCPClient/lib/clientScripts/restructure_dip_for_content_dm_upload.py

@@ -139,6 +139,27 @@ def addAipUuidToDcMetadata(dipUuid, dcMetadata):
     return dcMetadata
 
 
+def get_csv_headers(rows):
+    if not rows:
+        return []
+
+    # Use the longest header as the reference.
+    headers = rows[0]["csv_header"]
+    for row in rows[1:]:
+        if headers != row["csv_header"] and len(row["csv_header"]) > len(headers):
+            headers = row["csv_header"]
+
+    # Look for differences in the header of each row and insert them before the
+    # AIP UUID column.
+    for row in rows:
+        if row["csv_header"] != headers:
+            difference = [h for h in row["csv_header"] if h not in headers]
+            idx = headers.index("AIP UUID")
+            headers = headers[:idx] + difference + headers[idx:]
+
+    return headers
+
+
 def generate_project_client_package(
     job, output_dir, package_type, structmap, dmdsecs, dipuuid
 ):
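As a quick sanity check of the merge rule, here is a minimal standalone sketch of `get_csv_headers` (the function body is copied from the hunk above; the row data is hypothetical). When a row carries a column that the reference header lacks, that column is spliced in immediately before "AIP UUID":

```python
# Sketch only: function copied from the diff above, rows are made up.
def get_csv_headers(rows):
    if not rows:
        return []

    # Use the longest header as the reference.
    headers = rows[0]["csv_header"]
    for row in rows[1:]:
        if headers != row["csv_header"] and len(row["csv_header"]) > len(headers):
            headers = row["csv_header"]

    # Insert any extra per-row columns before the AIP UUID column.
    for row in rows:
        if row["csv_header"] != headers:
            difference = [h for h in row["csv_header"] if h not in headers]
            idx = headers.index("AIP UUID")
            headers = headers[:idx] + difference + headers[idx:]

    return headers


rows = [
    {"csv_header": ["title", "creator", "subject", "AIP UUID", "file UUID", "Filename"]},
    {"csv_header": ["title", "alternative_title", "AIP UUID", "file UUID", "Filename"]},
]
print(get_csv_headers(rows))
# ['title', 'creator', 'subject', 'alternative_title', 'AIP UUID', 'file UUID', 'Filename']
```

This replaces the old fail-fast behaviour, where any header mismatch aborted the job with `return 1`.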
@@ -161,75 +182,75 @@ def generate_project_client_package(
     job.pyprint("Path to the output tabfile", csv_path)
 
     divs_with_dmdsecs = structmap.findall(".//mets:div[@DMDID]", namespaces=ns.NSMAP)
+
+    # Iterate through every div and create a row for each
+    rows = []
+    for div in divs_with_dmdsecs:
+        # Find associated dmdSecs
+        dmdids = div.get("DMDID").split()
+        # Take nonDC dmdSec, fallback to DC dmdSec
+        dmdsecpair = splitDmdSecs(job, [dmdsecs[dmdid] for dmdid in dmdids])
+        dmdsecpair["dc"] = addAipUuidToDcMetadata(dipuuid, dmdsecpair["dc"])
+        metadata = dmdsecpair["nonDc"] or dmdsecpair["dc"]
+        # Skip dmdSecs without metadata
+        if not metadata:
+            continue
+        # Create csv_header and csv_values from the dmdSec metadata
+        csv_header = []
+        csv_values = []
+        for header, value in metadata.items():
+            csv_header.append(header)
+            value = "; ".join(value).replace("\r", "").replace("\n", "")
+            csv_values.append(value)
+
+        # Add AIP UUID
+        csv_header.append("AIP UUID")
+        csv_values.append(dipuuid)
+
+        # Add file UUID
+        csv_header.append("file UUID")
+        if "dirs" in package_type:
+            # Directories have no file UUID
+            csv_values.append("")
+        else:
+            file_uuid = ""
+            fptr = div.find("mets:fptr", namespaces=ns.NSMAP)
+            # Only files have fptrs as direct children
+            if fptr is not None:
+                # File UUID is last 36 characters of FILEID
+                file_uuid = fptr.get("FILEID")[-36:]
+            csv_values.append(file_uuid)
+
+        # Add file or directory name
+        name = div.attrib["LABEL"]  # Fallback if LABEL doesn't exist?
+        if "dirs" in package_type:
+            csv_header.insert(0, "Directory name")
+            csv_values.insert(0, name)
+        else:
+            csv_header.append("Filename")
+            csv_values.append(name)
+
+        rows.append({"csv_header": csv_header, "csv_values": csv_values})
+
+    headers = get_csv_headers(rows)
+
     with open(csv_path, "w") as csv_file:
-        writer = csv.writer(csv_file, delimiter="\t")
-
-        # Iterate through every div and create a row for each
-        csv_header_ref = None
-        for div in divs_with_dmdsecs:
-            # Find associated dmdSecs
-            dmdids = div.get("DMDID").split()
-            # Take nonDC dmdSec, fallback to DC dmdSec
-            dmdsecpair = splitDmdSecs(job, [dmdsecs[dmdid] for dmdid in dmdids])
-            dmdsecpair["dc"] = addAipUuidToDcMetadata(dipuuid, dmdsecpair["dc"])
-            metadata = dmdsecpair["nonDc"] or dmdsecpair["dc"]
-            # Skip dmdSecs without metadata
-            if not metadata:
-                continue
-            # Create csv_header and csv_values from the dmdSec metadata
-            csv_header = []
-            csv_values = []
-            for header, value in metadata.items():
-                csv_header.append(header)
-                value = "; ".join(value).replace("\r", "").replace("\n", "")
-                csv_values.append(value)
-
-            # Add AIP UUID
-            csv_header.append("AIP UUID")
-            csv_values.append(dipuuid)
-
-            # Add file UUID
-            csv_header.append("file UUID")
-            if "dirs" in package_type:
-                # Directories have no file UUID
-                csv_values.append("")
-            else:
-                file_uuid = ""
-                fptr = div.find("mets:fptr", namespaces=ns.NSMAP)
-                # Only files have fptrs as direct children
-                if fptr is not None:
-                    # File UUID is last 36 characters of FILEID
-                    file_uuid = fptr.get("FILEID")[-36:]
-                csv_values.append(file_uuid)
-
-            # Add file or directory name
-            name = div.attrib["LABEL"]  # Fallback if LABEL doesn't exist?
-            if "dirs" in package_type:
-                csv_header.insert(0, "Directory name")
-                csv_values.insert(0, name)
-            else:
-                csv_header.append("Filename")
-                csv_values.append(name)
-
-            # Compare csv_header, if diff ERROR (first time set, write to file)
-            if csv_header_ref and csv_header_ref != csv_header:
-                job.pyprint(
-                    "ERROR headers differ,",
-                    csv_path,
-                    "almost certainly invalid",
-                    file=sys.stderr,
-                )
-                job.pyprint("Reference header:", csv_header_ref, file=sys.stderr)
-                job.pyprint("Differing header:", csv_header, file=sys.stderr)
-                return 1
-            # If first time through, write out header
-            if not csv_header_ref:
-                csv_header_ref = csv_header
-                writer.writerow(csv_header_ref)
-                job.pyprint("Tabfile header:", csv_header)
-            # Write csv_row
+        writer = csv.DictWriter(
+            csv_file, headers, extrasaction="ignore", delimiter="\t"
+        )
+        writer.writerow({header: header for header in headers})
+        job.pyprint("Tabfile header:", headers)
+
+        for row in rows:
+            csv_values = {}
+            for header in headers:
+                if header in row["csv_header"]:
+                    idx = row["csv_header"].index(header)
+                    value = row["csv_values"][idx]
+                    csv_values[header] = value
             writer.writerow(csv_values)
-            job.pyprint("Values:", csv_values)
+            job.pyprint("Values:", [csv_values.get(header, "") for header in headers])
 
     return 0
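The rewritten writer block leans on two stock `csv.DictWriter` behaviours: keys missing from a row are written as empty fields (`restval` defaults to `""`), and `extrasaction="ignore"` drops dict keys that are not in the field list instead of raising `ValueError`. A small self-contained sketch of that behaviour (values are hypothetical):

```python
import csv
import io

headers = ["title", "alternative_title", "AIP UUID"]
buf = io.StringIO()
writer = csv.DictWriter(buf, headers, extrasaction="ignore", delimiter="\t")

# Equivalent to writer.writeheader(); the script spells it out explicitly.
writer.writerow({header: header for header in headers})

# "alternative_title" is absent, so its field stays empty; "Filename" is not
# in headers, so extrasaction="ignore" silently discards it.
writer.writerow({"title": "Yamani Weapons", "AIP UUID": "a2f1f249", "Filename": "objects"})

print(buf.getvalue())
# title	alternative_title	AIP UUID
# Yamani Weapons		a2f1f249
```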
82 changes: 82 additions & 0 deletions src/MCPClient/tests/fixtures/mets_sip_dc_with_optional_columns.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="windows-1252"?>
+<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd">
+  <mets:metsHdr CREATEDATE="2015-06-15T20:10:16"/>
+  <mets:dmdSec ID="dmdSec_1">
+    <mets:mdWrap MDTYPE="DC">
+      <mets:xmlData>
+        <dcterms:dublincore xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">
+          <dc:title>Yamani Weapons</dc:title>
+          <dc:creator>Keladry of Mindelan</dc:creator>
+          <dc:subject>Glaives</dc:subject>
+          <dc:subject>Swords</dc:subject>
+          <dc:subject>Blades</dc:subject>
+          <dc:description>Glaives are cool</dc:description>
+          <dc:publisher>Tortall Press</dc:publisher>
+          <dc:contributor>Yuki</dc:contributor>
+          <dc:date>2014</dc:date>
+          <dc:type>Archival Information Package</dc:type>
+          <dc:format>parchement</dc:format>
+          <dc:identifier>42/1</dc:identifier>
+          <dc:source>Numair's library</dc:source>
+          <dc:relation>None</dc:relation>
+          <dc:language>en</dc:language>
+          <dc:rights>Public Domain</dc:rights>
+          <dcterms:isPartOf>AIC#43</dcterms:isPartOf>
+        </dcterms:dublincore>
+      </mets:xmlData>
+    </mets:mdWrap>
+  </mets:dmdSec>
+  <mets:dmdSec ID="dmdSec_2">
+    <mets:mdWrap MDTYPE="DC">
+      <mets:xmlData>
+        <dcterms:dublincore xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">
+          <dc:title>Evelyn's photo</dc:title>
+          <alternative_title>A photo with Evelyn in it</alternative_title>
+          <dc:creator>Evelyn</dc:creator>
+          <dcterms:isPartOf>AIC#43</dcterms:isPartOf>
+        </dcterms:dublincore>
+      </mets:xmlData>
+    </mets:mdWrap>
+  </mets:dmdSec>
+  <mets:fileSec>
+    <mets:fileGrp USE="original">
+      <mets:file GROUPID="Group-4ad45093-ca36-4ee4-bccd-b82dc502ae53" ID="file-4ad45093-ca36-4ee4-bccd-b82dc502ae53">
+        <mets:FLocat xlink:href="objects/evelyn_s_photo.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+    <mets:fileGrp USE="submissionDocumentation">
+      <mets:file GROUPID="Group-e7bb725f-98f1-481f-b5b1-4c0134b801c3" ID="file-e7bb725f-98f1-481f-b5b1-4c0134b801c3">
+        <mets:FLocat xlink:href="objects/submissionDocumentation/transfer-sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214/METS.xml" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+    <mets:fileGrp USE="preservation">
+      <mets:file GROUPID="Group-4ad45093-ca36-4ee4-bccd-b82dc502ae53" ID="file-6e3f5f63-8424-417e-8357-f0a1ea05af62">
+        <mets:FLocat xlink:href="objects/evelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+  </mets:fileSec>
+  <mets:structMap ID="structMap_1" LABEL="Archivematica default" TYPE="physical">
+    <mets:div LABEL="sip-dc-eacbf65f-2528-4be0-8cb3-532f45fcdff8" TYPE="Directory">
+      <mets:div LABEL="objects" TYPE="Directory" DMDID="dmdSec_1">
+        <mets:div LABEL="evelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif" TYPE="Item" DMDID="dmdSec_2">
+          <mets:fptr FILEID="file-6e3f5f63-8424-417e-8357-f0a1ea05af62"/>
+        </mets:div>
+        <mets:div LABEL="evelyn_s_photo.jpg" TYPE="Item">
+          <mets:fptr FILEID="file-4ad45093-ca36-4ee4-bccd-b82dc502ae53"/>
+        </mets:div>
+        <mets:div LABEL="metadata" TYPE="Directory">
+          <mets:div LABEL="transfers" TYPE="Directory">
+            <mets:div LABEL="sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214" TYPE="Directory"/>
+          </mets:div>
+        </mets:div>
+        <mets:div LABEL="submissionDocumentation" TYPE="Directory">
+          <mets:div LABEL="transfer-sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214" TYPE="Directory">
+            <mets:div LABEL="METS.xml" TYPE="Item">
+              <mets:fptr FILEID="file-e7bb725f-98f1-481f-b5b1-4c0134b801c3"/>
+            </mets:div>
+          </mets:div>
+        </mets:div>
+      </mets:div>
+    </mets:div>
+  </mets:structMap>
+</mets:mets>
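What makes this fixture exercise the new code path is `dmdSec_2`: its non-namespaced `<alternative_title>` element produces a column that `dmdSec_1` does not have, which is what forces the header merge. A quick sketch that lists the column candidates each dmdSec contributes (assuming `lxml`, which the MCPClient scripts already use; the repeated `subject` entries are collapsed into a single "; "-joined column by the client script itself):

```python
from lxml import etree

mets = etree.parse("mets_sip_dc_with_optional_columns.xml")
ns = {"mets": "http://www.loc.gov/METS/"}
for dmdsec in mets.findall(".//mets:dmdSec", ns):
    # Local names of the dublincore children are the header candidates.
    fields = [
        etree.QName(el).localname
        for el in dmdsec.findall(".//mets:xmlData/*/*", ns)
    ]
    print(dmdsec.get("ID"), fields)
# dmdSec_1 -> title, creator, subject (x3), description, publisher, contributor,
#             date, type, format, identifier, source, relation, language, rights, isPartOf
# dmdSec_2 -> title, alternative_title, creator, isPartOf
```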
45 changes: 45 additions & 0 deletions src/MCPClient/tests/test_restructure_dip_for_content_dm_upload.py
@@ -25,6 +25,17 @@ def dip_directory(tmpdir):
     return tmpdir
 
 
+@pytest.fixture
+def dip_directory_optional_dc_columns(tmpdir):
+    shutil.copy(
+        os.path.join(THIS_DIR, "fixtures", "mets_sip_dc_with_optional_columns.xml"),
+        str(tmpdir / "METS.a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3.xml"),
+    )
+    (tmpdir / "objects").mkdir()
+
+    return tmpdir
+
+
 def test_restructure_dip_for_content_dm_upload(job, dip_directory):
     job.args = (
         None,
@@ -50,3 +61,37 @@ def test_restructure_dip_for_content_dm_upload(job, dip_directory):
         csv_data[1]
         == "objects Yamani Weapons Keladry of Mindelan Glaives Glaives are cool Tortall Press Yuki 2014 Archival Information Package parchement 42/1; a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3 Numair's library None en Public Domain AIC#43 a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3 "
     )
+
+
+def test_restructure_dip_for_content_dm_upload_with_optional_dc_columns(
+    job, dip_directory_optional_dc_columns
+):
+    job.args = (
+        None,
+        "--uuid=a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3",
+        f"--dipDir={dip_directory_optional_dc_columns}",
+    )
+    jobs = [job]
+
+    restructure_dip_for_content_dm_upload.call(jobs)
+    csv_data = (
+        (dip_directory_optional_dc_columns / "objects/compound.txt")
+        .read_text(encoding="utf-8")
+        .splitlines()
+    )
+
+    assert not job.error
+    assert job.get_exit_code() == 0
+
+    assert (
+        csv_data[0]
+        == "title\tcreator\tsubject\tdescription\tpublisher\tcontributor\tdate\ttype\tformat\tidentifier\tsource\trelation\tlanguage\trights\tisPartOf\talternative_title\tAIP UUID\tfile UUID\tFilename"
+    )
+    assert (
+        csv_data[1]
+        == "Yamani Weapons\tKeladry of Mindelan\tGlaives; Swords; Blades\tGlaives are cool\tTortall Press\tYuki\t2014\tArchival Information Package\tparchement\t42/1; a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\tNumair's library\tNone\ten\tPublic Domain\tAIC#43\t\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t\tobjects"
+    )
+    assert (
+        csv_data[2]
+        == "Evelyn's photo\tEvelyn\t\t\t\t\t\t\t\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t\t\t\t\tAIC#43\tA photo with Evelyn in it\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t6e3f5f63-8424-417e-8357-f0a1ea05af62\tevelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif"
+    )