Fix restructuring DIP for CONTENT dm upload #1858

Merged 2 commits on Oct 10, 2023
16 changes: 7 additions & 9 deletions hack/README.md
@@ -2,8 +2,6 @@
 
 - [Audience](#audience)
 - [Requirements](#requirements)
-  - [Docker and Linux](#docker-and-linux)
-  - [Docker and Mac](#docker-and-mac)
 - [Elasticsearch container](#elasticsearch-container)
 - [Installation](#installation)
 - [Web UIs](#web-uis)
@@ -28,16 +26,15 @@
 
 ## Audience
 
-This Archivematica environment is based on Docker Compose [V3][audience-compose-v3]
-and it is specifically **designed for developers**. Compose can be used in a
-production environment but that is beyond the scope of this recipe. Please read
+This Archivematica environment is based on Docker Compose and it is
+specifically **designed for developers**. Compose can be used in a production
+environment but that is beyond the scope of this recipe. Please read
 the [documentation][audience-compose-reference].
 
 Artefactual developers use Docker Compose on Linux heavily so it's important
 that you're familiar with it, and some choices in the configuration of this
 environment break in other operative systems.
 
-[audience-compose-v3]: https://docs.docker.com/compose/compose-file/compose-file-v3/
 [audience-compose-reference]: https://docs.docker.com/compose/reference/overview/
 
 ## Requirements
@@ -63,9 +60,10 @@ am-mysql-1 551.9MiB / 7.763GiB
 am-clamavd-1 570MiB / 7.763GiB
 ```
 
-Software dependencies: Docker Engine, Docker Compose V3, git and make. Please
-use a version of Docker Engine greater than 23.0 which includes Buildkit as
-the default builder with support for multi-stage builds.
+Software dependencies: Docker Engine, Docker Compose, git and make. Please use
+a version of Docker Engine greater than 23.0 which includes Buildkit as the
+default builder with support for multi-stage builds and a version of Docker
+Compose greater than 2.17 which supports restarts of dependent services.
 
 It is beyond the scope of this document to explain how these dependencies are
 installed in your computer.
src/MCPClient/lib/clientScripts/restructure_dip_for_content_dm_upload.py

@@ -139,6 +139,27 @@ def addAipUuidToDcMetadata(dipUuid, dcMetadata):
     return dcMetadata
 
 
+def get_csv_headers(rows):
+    if not rows:
+        return []
+
+    # Use the longest header as the reference.
+    headers = rows[0]["csv_header"]
+    for row in rows[1:]:
+        if headers != row["csv_header"] and len(row["csv_header"]) > len(headers):
+            headers = row["csv_header"]
+
+    # Look for differences in the header of each row and insert them before the
+    # AIP UUID column.
+    for row in rows:
+        if row["csv_header"] != headers:
+            difference = [h for h in row["csv_header"] if h not in headers]
+            idx = headers.index("AIP UUID")
+            headers = headers[:idx] + difference + headers[idx:]
+
+    return headers
+
+
 def generate_project_client_package(
     job, output_dir, package_type, structmap, dmdsecs, dipuuid
 ):
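As a quick sanity check of the merge rule, here is a minimal standalone sketch of `get_csv_headers` (the function body is copied from the hunk above; the row data is hypothetical). When a row carries a column that the reference header lacks, that column is spliced in immediately before "AIP UUID":

```python
# Sketch only: function copied from the diff above, rows are made up.
def get_csv_headers(rows):
    if not rows:
        return []

    # Use the longest header as the reference.
    headers = rows[0]["csv_header"]
    for row in rows[1:]:
        if headers != row["csv_header"] and len(row["csv_header"]) > len(headers):
            headers = row["csv_header"]

    # Insert any extra per-row columns before the AIP UUID column.
    for row in rows:
        if row["csv_header"] != headers:
            difference = [h for h in row["csv_header"] if h not in headers]
            idx = headers.index("AIP UUID")
            headers = headers[:idx] + difference + headers[idx:]

    return headers


rows = [
    {"csv_header": ["title", "creator", "subject", "AIP UUID", "file UUID", "Filename"]},
    {"csv_header": ["title", "alternative_title", "AIP UUID", "file UUID", "Filename"]},
]
print(get_csv_headers(rows))
# ['title', 'creator', 'subject', 'alternative_title', 'AIP UUID', 'file UUID', 'Filename']
```

This replaces the old fail-fast behaviour, where any header mismatch aborted the job with `return 1`.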
@@ -161,75 +182,75 @@ def generate_project_client_package(
     job.pyprint("Path to the output tabfile", csv_path)
 
     divs_with_dmdsecs = structmap.findall(".//mets:div[@DMDID]", namespaces=ns.NSMAP)
+
+    # Iterate through every div and create a row for each
+    rows = []
+    for div in divs_with_dmdsecs:
+        # Find associated dmdSecs
+        dmdids = div.get("DMDID").split()
+        # Take nonDC dmdSec, fallback to DC dmdSec
+        dmdsecpair = splitDmdSecs(job, [dmdsecs[dmdid] for dmdid in dmdids])
+        dmdsecpair["dc"] = addAipUuidToDcMetadata(dipuuid, dmdsecpair["dc"])
+        metadata = dmdsecpair["nonDc"] or dmdsecpair["dc"]
+        # Skip dmdSecs without metadata
+        if not metadata:
+            continue
+        # Create csv_header and csv_values from the dmdSec metadata
+        csv_header = []
+        csv_values = []
+        for header, value in metadata.items():
+            csv_header.append(header)
+            value = "; ".join(value).replace("\r", "").replace("\n", "")
+            csv_values.append(value)
+
+        # Add AIP UUID
+        csv_header.append("AIP UUID")
+        csv_values.append(dipuuid)
+
+        # Add file UUID
+        csv_header.append("file UUID")
+        if "dirs" in package_type:
+            # Directories have no file UUID
+            csv_values.append("")
+        else:
+            file_uuid = ""
+            fptr = div.find("mets:fptr", namespaces=ns.NSMAP)
+            # Only files have fptrs as direct children
+            if fptr is not None:
+                # File UUID is last 36 characters of FILEID
+                file_uuid = fptr.get("FILEID")[-36:]
+            csv_values.append(file_uuid)
+
+        # Add file or directory name
+        name = div.attrib["LABEL"]  # Fallback if LABEL doesn't exist?
+        if "dirs" in package_type:
+            csv_header.insert(0, "Directory name")
+            csv_values.insert(0, name)
+        else:
+            csv_header.append("Filename")
+            csv_values.append(name)
+
+        rows.append({"csv_header": csv_header, "csv_values": csv_values})
+
+    headers = get_csv_headers(rows)
+
     with open(csv_path, "w") as csv_file:
-        writer = csv.writer(csv_file, delimiter="\t")
-
-        # Iterate through every div and create a row for each
-        csv_header_ref = None
-        for div in divs_with_dmdsecs:
-            # Find associated dmdSecs
-            dmdids = div.get("DMDID").split()
-            # Take nonDC dmdSec, fallback to DC dmdSec
-            dmdsecpair = splitDmdSecs(job, [dmdsecs[dmdid] for dmdid in dmdids])
-            dmdsecpair["dc"] = addAipUuidToDcMetadata(dipuuid, dmdsecpair["dc"])
-            metadata = dmdsecpair["nonDc"] or dmdsecpair["dc"]
-            # Skip dmdSecs without metadata
-            if not metadata:
-                continue
-            # Create csv_header and csv_values from the dmdSec metadata
-            csv_header = []
-            csv_values = []
-            for header, value in metadata.items():
-                csv_header.append(header)
-                value = "; ".join(value).replace("\r", "").replace("\n", "")
-                csv_values.append(value)
-
-            # Add AIP UUID
-            csv_header.append("AIP UUID")
-            csv_values.append(dipuuid)
-
-            # Add file UUID
-            csv_header.append("file UUID")
-            if "dirs" in package_type:
-                # Directories have no file UUID
-                csv_values.append("")
-            else:
-                file_uuid = ""
-                fptr = div.find("mets:fptr", namespaces=ns.NSMAP)
-                # Only files have fptrs as direct children
-                if fptr is not None:
-                    # File UUID is last 36 characters of FILEID
-                    file_uuid = fptr.get("FILEID")[-36:]
-                csv_values.append(file_uuid)
-
-            # Add file or directory name
-            name = div.attrib["LABEL"]  # Fallback if LABEL doesn't exist?
-            if "dirs" in package_type:
-                csv_header.insert(0, "Directory name")
-                csv_values.insert(0, name)
-            else:
-                csv_header.append("Filename")
-                csv_values.append(name)
-
-            # Compare csv_header, if diff ERROR (first time set, write to file)
-            if csv_header_ref and csv_header_ref != csv_header:
-                job.pyprint(
-                    "ERROR headers differ,",
-                    csv_path,
-                    "almost certainly invalid",
-                    file=sys.stderr,
-                )
-                job.pyprint("Reference header:", csv_header_ref, file=sys.stderr)
-                job.pyprint("Differing header:", csv_header, file=sys.stderr)
-                return 1
-            # If first time through, write out header
-            if not csv_header_ref:
-                csv_header_ref = csv_header
-                writer.writerow(csv_header_ref)
-                job.pyprint("Tabfile header:", csv_header)
-            # Write csv_row
+        writer = csv.DictWriter(
+            csv_file, headers, extrasaction="ignore", delimiter="\t"
+        )
+        writer.writerow({header: header for header in headers})
+        job.pyprint("Tabfile header:", headers)
+
+        for row in rows:
+            csv_values = {}
+            for header in headers:
+                if header in row["csv_header"]:
+                    idx = row["csv_header"].index(header)
+                    value = row["csv_values"][idx]
+                    csv_values[header] = value
             writer.writerow(csv_values)
-            job.pyprint("Values:", csv_values)
+            job.pyprint("Values:", [csv_values.get(header, "") for header in headers])
 
     return 0
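The rewritten writer block leans on two stock `csv.DictWriter` behaviours: keys missing from a row are written as empty fields (`restval` defaults to `""`), and `extrasaction="ignore"` drops dict keys that are not in the field list instead of raising `ValueError`. A small self-contained sketch of that behaviour (values are hypothetical):

```python
import csv
import io

headers = ["title", "alternative_title", "AIP UUID"]
buf = io.StringIO()
writer = csv.DictWriter(buf, headers, extrasaction="ignore", delimiter="\t")

# Equivalent to writer.writeheader(); the script spells it out explicitly.
writer.writerow({header: header for header in headers})

# "alternative_title" is absent, so its field stays empty; "Filename" is not
# in headers, so extrasaction="ignore" silently discards it.
writer.writerow({"title": "Yamani Weapons", "AIP UUID": "a2f1f249", "Filename": "objects"})

print(buf.getvalue())
# title	alternative_title	AIP UUID
# Yamani Weapons		a2f1f249
```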
82 changes: 82 additions & 0 deletions src/MCPClient/tests/fixtures/mets_sip_dc_with_optional_columns.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="windows-1252"?>
+<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd">
+  <mets:metsHdr CREATEDATE="2015-06-15T20:10:16"/>
+  <mets:dmdSec ID="dmdSec_1">
+    <mets:mdWrap MDTYPE="DC">
+      <mets:xmlData>
+        <dcterms:dublincore xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">
+          <dc:title>Yamani Weapons</dc:title>
+          <dc:creator>Keladry of Mindelan</dc:creator>
+          <dc:subject>Glaives</dc:subject>
+          <dc:subject>Swords</dc:subject>
+          <dc:subject>Blades</dc:subject>
+          <dc:description>Glaives are cool</dc:description>
+          <dc:publisher>Tortall Press</dc:publisher>
+          <dc:contributor>Yuki</dc:contributor>
+          <dc:date>2014</dc:date>
+          <dc:type>Archival Information Package</dc:type>
+          <dc:format>parchement</dc:format>
+          <dc:identifier>42/1</dc:identifier>
+          <dc:source>Numair's library</dc:source>
+          <dc:relation>None</dc:relation>
+          <dc:language>en</dc:language>
+          <dc:rights>Public Domain</dc:rights>
+          <dcterms:isPartOf>AIC#43</dcterms:isPartOf>
+        </dcterms:dublincore>
+      </mets:xmlData>
+    </mets:mdWrap>
+  </mets:dmdSec>
+  <mets:dmdSec ID="dmdSec_2">
+    <mets:mdWrap MDTYPE="DC">
+      <mets:xmlData>
+        <dcterms:dublincore xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">
+          <dc:title>Evelyn's photo</dc:title>
+          <alternative_title>A photo with Evelyn in it</alternative_title>
+          <dc:creator>Evelyn</dc:creator>
+          <dcterms:isPartOf>AIC#43</dcterms:isPartOf>
+        </dcterms:dublincore>
+      </mets:xmlData>
+    </mets:mdWrap>
+  </mets:dmdSec>
+  <mets:fileSec>
+    <mets:fileGrp USE="original">
+      <mets:file GROUPID="Group-4ad45093-ca36-4ee4-bccd-b82dc502ae53" ID="file-4ad45093-ca36-4ee4-bccd-b82dc502ae53">
+        <mets:FLocat xlink:href="objects/evelyn_s_photo.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+    <mets:fileGrp USE="submissionDocumentation">
+      <mets:file GROUPID="Group-e7bb725f-98f1-481f-b5b1-4c0134b801c3" ID="file-e7bb725f-98f1-481f-b5b1-4c0134b801c3">
+        <mets:FLocat xlink:href="objects/submissionDocumentation/transfer-sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214/METS.xml" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+    <mets:fileGrp USE="preservation">
+      <mets:file GROUPID="Group-4ad45093-ca36-4ee4-bccd-b82dc502ae53" ID="file-6e3f5f63-8424-417e-8357-f0a1ea05af62">
+        <mets:FLocat xlink:href="objects/evelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
+      </mets:file>
+    </mets:fileGrp>
+  </mets:fileSec>
+  <mets:structMap ID="structMap_1" LABEL="Archivematica default" TYPE="physical">
+    <mets:div LABEL="sip-dc-eacbf65f-2528-4be0-8cb3-532f45fcdff8" TYPE="Directory">
+      <mets:div LABEL="objects" TYPE="Directory" DMDID="dmdSec_1">
+        <mets:div LABEL="evelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif" TYPE="Item" DMDID="dmdSec_2">
+          <mets:fptr FILEID="file-6e3f5f63-8424-417e-8357-f0a1ea05af62"/>
+        </mets:div>
+        <mets:div LABEL="evelyn_s_photo.jpg" TYPE="Item">
+          <mets:fptr FILEID="file-4ad45093-ca36-4ee4-bccd-b82dc502ae53"/>
+        </mets:div>
+        <mets:div LABEL="metadata" TYPE="Directory">
+          <mets:div LABEL="transfers" TYPE="Directory">
+            <mets:div LABEL="sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214" TYPE="Directory"/>
+          </mets:div>
+        </mets:div>
+        <mets:div LABEL="submissionDocumentation" TYPE="Directory">
+          <mets:div LABEL="transfer-sip-dc-c6adbd7b-c0e8-480c-ad37-dc887f97e214" TYPE="Directory">
+            <mets:div LABEL="METS.xml" TYPE="Item">
+              <mets:fptr FILEID="file-e7bb725f-98f1-481f-b5b1-4c0134b801c3"/>
+            </mets:div>
+          </mets:div>
+        </mets:div>
+      </mets:div>
+    </mets:div>
+  </mets:structMap>
+</mets:mets>
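What makes this fixture exercise the new code path is `dmdSec_2`: its non-namespaced `<alternative_title>` element produces a column that `dmdSec_1` does not have, which is what forces the header merge. A quick sketch that lists the column candidates each dmdSec contributes (assuming `lxml`, which the MCPClient scripts already use; the repeated `subject` entries are collapsed into a single "; "-joined column by the client script itself):

```python
from lxml import etree

mets = etree.parse("mets_sip_dc_with_optional_columns.xml")
ns = {"mets": "http://www.loc.gov/METS/"}
for dmdsec in mets.findall(".//mets:dmdSec", ns):
    # Local names of the dublincore children are the header candidates.
    fields = [
        etree.QName(el).localname
        for el in dmdsec.findall(".//mets:xmlData/*/*", ns)
    ]
    print(dmdsec.get("ID"), fields)
# dmdSec_1 -> title, creator, subject (x3), description, publisher, contributor,
#             date, type, format, identifier, source, relation, language, rights, isPartOf
# dmdSec_2 -> title, alternative_title, creator, isPartOf
```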
45 changes: 45 additions & 0 deletions src/MCPClient/tests/test_restructure_dip_for_content_dm_upload.py
@@ -25,6 +25,17 @@ def dip_directory(tmpdir):
     return tmpdir
 
 
+@pytest.fixture
+def dip_directory_optional_dc_columns(tmpdir):
+    shutil.copy(
+        os.path.join(THIS_DIR, "fixtures", "mets_sip_dc_with_optional_columns.xml"),
+        str(tmpdir / "METS.a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3.xml"),
+    )
+    (tmpdir / "objects").mkdir()
+
+    return tmpdir
+
+
 def test_restructure_dip_for_content_dm_upload(job, dip_directory):
     job.args = (
         None,
@@ -50,3 +61,37 @@ def test_restructure_dip_for_content_dm_upload(job, dip_directory):
         csv_data[1]
         == "objects Yamani Weapons Keladry of Mindelan Glaives Glaives are cool Tortall Press Yuki 2014 Archival Information Package parchement 42/1; a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3 Numair's library None en Public Domain AIC#43 a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3 "
     )
+
+
+def test_restructure_dip_for_content_dm_upload_with_optional_dc_columns(
+    job, dip_directory_optional_dc_columns
+):
+    job.args = (
+        None,
+        "--uuid=a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3",
+        f"--dipDir={dip_directory_optional_dc_columns}",
+    )
+    jobs = [job]
+
+    restructure_dip_for_content_dm_upload.call(jobs)
+    csv_data = (
+        (dip_directory_optional_dc_columns / "objects/compound.txt")
+        .read_text(encoding="utf-8")
+        .splitlines()
+    )
+
+    assert not job.error
+    assert job.get_exit_code() == 0
+
+    assert (
+        csv_data[0]
+        == "title\tcreator\tsubject\tdescription\tpublisher\tcontributor\tdate\ttype\tformat\tidentifier\tsource\trelation\tlanguage\trights\tisPartOf\talternative_title\tAIP UUID\tfile UUID\tFilename"
+    )
+    assert (
+        csv_data[1]
+        == "Yamani Weapons\tKeladry of Mindelan\tGlaives; Swords; Blades\tGlaives are cool\tTortall Press\tYuki\t2014\tArchival Information Package\tparchement\t42/1; a2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\tNumair's library\tNone\ten\tPublic Domain\tAIC#43\t\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t\tobjects"
+    )
+    assert (
+        csv_data[2]
+        == "Evelyn's photo\tEvelyn\t\t\t\t\t\t\t\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t\t\t\t\tAIC#43\tA photo with Evelyn in it\ta2f1f249-7bd4-4f52-8f1a-84319cb1b6d3\t6e3f5f63-8424-417e-8357-f0a1ea05af62\tevelyn_s_photo-4a4dfb4d-caa3-40ff-99c0-34ed176bb84b.tif"
+    )