From 3f0673fa10876ec5cf1e2f25bf3401d79a65f6dd Mon Sep 17 00:00:00 2001 From: David Juhasz Date: Thu, 5 Sep 2024 17:40:17 -0700 Subject: [PATCH] Add checksum verification to "verify manifest" Fixes #40. - Switch to `manifest.Files()` to parse SFA manifest files more efficiently - Add checksum verification to the "verify manifest" activity - Move "internal/workflow/testdata/little-Test-AIP-Digitization" to "internal/testdata/little-Test-AIP-Digitization" - Update "testdata/little-Test-AIP-Digitization" file hashes to match file contents - Add a verify checksums event to the preprocessing workflow --- go.mod | 2 - go.sum | 5 - internal/activities/verify_manifest.go | 168 +++++++++++++----- internal/activities/verify_manifest_test.go | 153 +++++++++------- .../additional/UpdatedAreldaMetadata.xml | 52 +++--- .../content/content/d_0000001/00000001.jp2 | Bin .../content/d_0000001/00000001_PREMIS.xml | 0 .../content/content/d_0000001/00000002.jp2 | Bin .../content/d_0000001/00000002_PREMIS.xml | 0 .../Prozess_Digitalisierung_PREMIS.xml | 0 .../content/header/old/SIP/metadata.xml | 0 .../content/header/xsd/ablieferung.xsd | 0 .../content/header/xsd/archivischeNotiz.xsd | 0 .../header/xsd/archivischerVorgang.xsd | 0 .../content/header/xsd/arelda.xsd | 0 .../content/header/xsd/base.xsd | 0 .../content/header/xsd/datei.xsd | 0 .../content/header/xsd/dokument.xsd | 0 .../content/header/xsd/dossier.xsd | 0 .../content/header/xsd/ordner.xsd | 0 .../content/header/xsd/ordnungssystem.xsd | 0 .../header/xsd/ordnungssystemposition.xsd | 0 .../content/header/xsd/paket.xsd | 0 .../content/header/xsd/provenienz.xsd | 0 .../content/header/xsd/zusatzDaten.xsd | 0 internal/workflow/preprocessing.go | 26 ++- internal/workflow/preprocessing_test.go | 23 ++- 27 files changed, 270 insertions(+), 159 deletions(-) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml (98%) rename internal/{workflow => 
}/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001.jp2 (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001_PREMIS.xml (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002.jp2 (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002_PREMIS.xml (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/content/d_0000001/Prozess_Digitalisierung_PREMIS.xml (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/old/SIP/metadata.xml (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/ablieferung.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischeNotiz.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischerVorgang.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/arelda.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/base.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/datei.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/dokument.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/dossier.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/ordner.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystem.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystemposition.xsd (100%) rename internal/{workflow => 
}/testdata/little-Test-AIP-Digitization/content/header/xsd/paket.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/provenienz.xsd (100%) rename internal/{workflow => }/testdata/little-Test-AIP-Digitization/content/header/xsd/zusatzDaten.xsd (100%) diff --git a/go.mod b/go.mod index eaec2a2a..c70415b9 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,6 @@ module github.com/artefactual-sdps/preprocessing-sfa go 1.22.6 require ( - github.com/antchfx/xmlquery v1.4.1 github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b github.com/beevik/etree v1.4.0 github.com/deckarep/golang-set/v2 v2.6.0 @@ -23,7 +22,6 @@ require ( ) require ( - github.com/antchfx/xpath v1.3.1 // indirect github.com/aws/aws-sdk-go v1.55.5 // indirect github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect diff --git a/go.sum b/go.sum index 99457f6a..3e5c48a7 100644 --- a/go.sum +++ b/go.sum @@ -13,10 +13,6 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs= cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/antchfx/xmlquery v1.4.1 h1:YgpSwbeWvLp557YFTi8E3z6t6/hYjmFEtiEKbDfEbl0= -github.com/antchfx/xmlquery v1.4.1/go.mod h1:lKezcT8ELGt8kW5L+ckFMTbgdR61/odpPgDv8Gvi1fI= -github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk= -github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b h1:kTOc2pbkdII6/Z84Bus1q52z5KAOaT8vLpfRoOs1l1I= github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug= 
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= @@ -309,7 +305,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= diff --git a/internal/activities/verify_manifest.go b/internal/activities/verify_manifest.go index 3aad55a4..f53d2b2f 100644 --- a/internal/activities/verify_manifest.go +++ b/internal/activities/verify_manifest.go @@ -2,16 +2,19 @@ package activities import ( "context" + "crypto/md5" // #nosec: 501 -- not used for security. 
+ "encoding/hex" "fmt" + "io" "io/fs" "os" "path/filepath" "slices" - "github.com/antchfx/xmlquery" goset "github.com/deckarep/golang-set/v2" "github.com/artefactual-sdps/preprocessing-sfa/internal/enums" + "github.com/artefactual-sdps/preprocessing-sfa/internal/manifest" "github.com/artefactual-sdps/preprocessing-sfa/internal/sip" ) @@ -23,7 +26,10 @@ type ( SIP sip.SIP } VerifyManifestResult struct { - Failures []string + Failed bool + ChecksumFailures []string + MissingFiles []string + UnexpectedFiles []string } ) @@ -39,76 +45,56 @@ func (a *VerifyManifest) Execute(ctx context.Context, params *VerifyManifestPara if err != nil { return nil, fmt.Errorf("verify manifest: parse manifest: %v", err) } + manifestSet := goset.NewSetFromMapKeys(manifestFiles) sipFiles, err := sipFiles(params.SIP) if err != nil { return nil, fmt.Errorf("verify manifest: get SIP contents: %v", err) } - var failures []string - - if s := manifestFiles.Difference(sipFiles).ToSlice(); len(s) > 0 { - slices.Sort(s) - for _, p := range s { - failures = append(failures, fmt.Sprintf("Missing file: %s", p)) - } + badChecksums, err := verifyChecksums(manifestFiles, sipFiles, params.SIP.Path) + if err != nil { + return nil, fmt.Errorf("verify checksums: %v", err) } - if s := sipFiles.Difference(manifestFiles).ToSlice(); len(s) > 0 { - slices.Sort(s) - for _, p := range s { - failures = append(failures, fmt.Sprintf("Unexpected file: %s", p)) - } - } + missing := missingFiles(manifestSet, sipFiles) + unexpected := unexpectedFiles(manifestSet, sipFiles) - return &VerifyManifestResult{Failures: failures}, nil + return &VerifyManifestResult{ + Failed: len(missing) > 0 || len(unexpected) > 0 || len(badChecksums) > 0, + ChecksumFailures: badChecksums, + MissingFiles: missing, + UnexpectedFiles: unexpected, + }, nil } -// manifestFiles returns the set of all files paths listed in a SIP's manifest. 
-func manifestFiles(s sip.SIP) (goset.Set[string], error) { +// manifestFiles parses the SIP manifest and returns a map of file paths +// (relative to the SIP root directory) to file checksums. +func manifestFiles(s sip.SIP) (map[string]*manifest.Checksum, error) { f, err := os.Open(s.ManifestPath) if err != nil { return nil, fmt.Errorf("open: %v", err) } - doc, err := xmlquery.Parse(f) + files, err := manifest.Files(f) if err != nil { - return nil, fmt.Errorf("parse document: %v", err) - } - - manifest, err := xmlquery.Query(doc, "//paket/inhaltsverzeichnis") - if err != nil || manifest == nil { - return nil, fmt.Errorf("missing inhaltsverzeichnis entry: %v", err) + return nil, err } - root := "" + // Prefix "content/" to digitized AIP file paths. if s.Type == enums.SIPTypeDigitizedAIP { - root = "content" - } - - return walkNode(manifest, root), nil -} - -// walkNode recursively walks node's xpath tree and returns the set of all file -// (excluding directories) paths found. -func walkNode(node *xmlquery.Node, path string) goset.Set[string] { - paths := goset.NewSet[string]() - - for _, n := range node.SelectElements("ordner") { - name := n.SelectElement("name").InnerText() - paths = paths.Union(walkNode(n, filepath.Join(path, name))) - } - - for _, n := range node.SelectElements("datei") { - name := n.SelectElement("name").InnerText() - paths.Add(filepath.Join(path, name)) + m := make(map[string]*manifest.Checksum, len(files)) + for k, v := range files { + m[filepath.Join("content", k)] = v + } + files = m } - return paths + return files, nil } // sipFiles recursively walks dir's tree and returns the set of all file -// (excluding directories) paths found. +// (excluding directory) paths found. 
func sipFiles(s sip.SIP) (goset.Set[string], error) { root := s.Path if s.Type == enums.SIPTypeDigitizedAIP { @@ -130,8 +116,8 @@ func sipFiles(s sip.SIP) (goset.Set[string], error) { return err } - // Digitized SIP and born-digital SIPs don't include metadata.xml in the - // manifest, so ignore the file here. + // Digitized SIPs and born-digital SIPs don't include metadata.xml in + // the manifest, so ignore the file here. if s.Type != enums.SIPTypeDigitizedAIP && p == "header/metadata.xml" { return nil } @@ -146,3 +132,87 @@ func sipFiles(s sip.SIP) (goset.Set[string], error) { return paths, nil } + +// missingFiles returns the list of all files that are in manifest but not +// filesys. +func missingFiles(manifest, filesys goset.Set[string]) []string { + var missing []string + if s := manifest.Difference(filesys).ToSlice(); len(s) > 0 { + slices.Sort(s) + for _, p := range s { + missing = append(missing, fmt.Sprintf("Missing file: %s", p)) + } + } + return missing +} + +// unexpectedFiles returns the list of all files that are in filesys but not +// manifest. +func unexpectedFiles(manifest, filesys goset.Set[string]) []string { + var unexpected []string + if s := filesys.Difference(manifest).ToSlice(); len(s) > 0 { + slices.Sort(s) + for _, p := range s { + unexpected = append(unexpected, fmt.Sprintf("Unexpected file: %s", p)) + } + } + return unexpected +} + +// verifyChecksums checks that each manifestFiles file checksum matches the +// checksum generated from the actual file contents. If a file is on the +// manifest but missing from the filesystem, or vice versa, it will be skipped +// with no validation message. The root is the absolute path to the root +// directory of the SIP, and is prefixed to each relative file path in the +// manifest to create an absolute path to the file. 
+func verifyChecksums( + manifestFiles map[string]*manifest.Checksum, + sipFiles goset.Set[string], + root string, +) ([]string, error) { + var failures []string + + for path, checksum := range manifestFiles { + // Check if file exists on filesystem. + if !sipFiles.Contains(path) { + continue + } + + // Generate checksum from filesystem file contents. + switch checksum.Algorithm { + case "MD5": + hash, err := md5Hash(filepath.Join(root, path)) + if err != nil { + return nil, fmt.Errorf("generate MD5 hash: %v", err) + } + if hash != checksum.Hash { + failures = append( + failures, + fmt.Sprintf("Checksum mismatch for %q (expected: %q, got: %q)", path, checksum.Hash, hash), + ) + } + default: + return nil, fmt.Errorf("hash algorithm %q is not supported", checksum.Algorithm) + } + } + slices.Sort(failures) + + return failures, nil +} + +// md5Hash returns a hexadecimal encoded hash string generated from the contents +// of the file at path. +func md5Hash(path string) (string, error) { + f, err := os.Open(path) // #nosec: G304 -- trusted path. + if err != nil { + return "", fmt.Errorf("open file: %v", err) + } + defer f.Close() + + h := md5.New() // #nosec: G401 -- not used for security. 
+ if _, err := io.Copy(h, f); err != nil { + return "", fmt.Errorf("copy contents: %v", err) + } + + return hex.EncodeToString(h.Sum(nil)), nil +} diff --git a/internal/activities/verify_manifest_test.go b/internal/activities/verify_manifest_test.go index 012c3e87..326faf58 100644 --- a/internal/activities/verify_manifest_test.go +++ b/internal/activities/verify_manifest_test.go @@ -36,7 +36,7 @@ const ( metadata.xml metadata.xml MD5 - 43c533d499c572fca699e77e06295ba3 + 636351dce76b47b3d40712813b9a34f3 @@ -46,7 +46,7 @@ const ( arelda.xsd arelda.xsd MD5 - f8454632e1ebf97e0aa8d9527ce2641f + 661c2df1b1e76d1446e90a54816d91ae @@ -58,31 +58,31 @@ const ( 00000001_PREMIS.xml 00000001_PREMIS.xml MD5 - 1428a269ff4e5b4894793b68646984b7 + e80b5017098950fc58aad83c8c14978e 00000002_PREMIS.xml 00000002_PREMIS.xml MD5 - f338f61911d2620972b0ac668dcc37ec + 33f12195e0fc136bc17de332c6b92b0d Prozess_Digitalisierung_PREMIS.xml Prozess_Digitalisierung_PREMIS.xml MD5 - 8067daaa900eba6dace69572eea8f8f3 + 816cabd1c0334ed363555889d9f4dbe4 00000001.jp2 00000001.jp2 MD5 - f7dc1f76a55cbdca0ae4a6dc8ae64644 + 827ccb0eea8a706c4c34a16891f84e7b 00000002.jp2 00000002.jp2 MD5 - 954d06be4a70c188b6b2e5fe4309fb2c + 1e01ba3e07ac48cbdab2d3284d1dd0fa @@ -107,7 +107,7 @@ const ( arelda.xsd arelda.xsd MD5 - f8454632e1ebf97e0aa8d9527ce2641f + 661c2df1b1e76d1446e90a54816d91ae @@ -121,31 +121,31 @@ const ( 00000001.jp2 00000001.jp2 MD5 - dc29291d0e2a18363d0efd2ec2fe81c9 + 827ccb0eea8a706c4c34a16891f84e7b 00000002.jp2 00000002.jp2 MD5 - 9093907ec32f06fe595e0f14982c4bf0 + 1e01ba3e07ac48cbdab2d3284d1dd0fa 00000001_PREMIS.xml 00000001_PREMIS.xml MD5 - 1d310772d26138a42eb2d6bebb637457 + e80b5017098950fc58aad83c8c14978e 00000002_PREMIS.xml 00000002_PREMIS.xml MD5 - abe7d286e9fa8db7ab8a3078df761c8e + 33f12195e0fc136bc17de332c6b92b0d Prozess_Digitalisierung_PREMIS.xml Prozess_Digitalisierung_PREMIS.xml MD5 - 21d8e90afdefd2c43386ca1d1658cab0 + 816cabd1c0334ed363555889d9f4dbe4 @@ -154,9 +154,9 @@ const ( ` ) -func 
testSIP(t *testing.T, dir *fs.Dir) sip.SIP { +func testSIP(t *testing.T, path string) sip.SIP { t.Helper() - s, err := sip.New(dir.Path()) + s, err := sip.New(path) if err != nil { t.Fatalf("sip: New(): %v", err) } @@ -175,35 +175,7 @@ func TestVerifyManifest(t *testing.T) { { name: "Verifies a digitized AIP manifest", params: activities.VerifyManifestParams{ - SIP: testSIP( - t, - fs.NewDir(t, "Test_Digitized_AIP", - fs.WithDir("additional", - fs.WithFile("UpdatedAreldaMetadata.xml", aipManifest), - ), - fs.WithDir("content", - fs.WithDir("content", - fs.WithDir("d_0000001", - fs.WithFile("00000001.jp2", ""), - fs.WithFile("00000001_PREMIS.xml", ""), - fs.WithFile("00000002.jp2", ""), - fs.WithFile("00000002_PREMIS.xml", ""), - fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""), - ), - ), - fs.WithDir("header", - fs.WithDir("old", - fs.WithDir("SIP", - fs.WithFile("metadata.xml", ""), - ), - ), - fs.WithDir("xsd", - fs.WithFile("arelda.xsd", ""), - ), - ), - ), - ), - ), + SIP: testSIP(t, "../testdata/little-Test-AIP-Digitization"), }, want: activities.VerifyManifestResult{}, }, @@ -215,20 +187,20 @@ func TestVerifyManifest(t *testing.T) { fs.NewDir(t, "Test_Digitized_SIP", fs.WithDir("content", fs.WithDir("d_0000001", - fs.WithFile("00000001.jp2", ""), - fs.WithFile("00000001_PREMIS.xml", ""), - fs.WithFile("00000002.jp2", ""), - fs.WithFile("00000002_PREMIS.xml", ""), - fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""), + fs.WithFile("00000001.jp2", "12345"), + fs.WithFile("00000001_PREMIS.xml", "abcdef"), + fs.WithFile("00000002.jp2", "67890"), + fs.WithFile("00000002_PREMIS.xml", "ghijk"), + fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", "lmnop"), ), ), fs.WithDir("header", fs.WithDir("xsd", - fs.WithFile("arelda.xsd", ""), + fs.WithFile("arelda.xsd", "vwxyz"), ), fs.WithFile("metadata.xml", sipManifest), ), - ), + ).Path(), ), }, want: activities.VerifyManifestResult{}, @@ -245,26 +217,27 @@ func TestVerifyManifest(t *testing.T) { 
fs.WithDir("content", fs.WithDir("content", fs.WithDir("d_0000001", - // fs.WithFile("00000001.jp2", ""), - fs.WithFile("00000001_PREMIS.xml", ""), - fs.WithFile("00000002.jp2", ""), - fs.WithFile("00000002_PREMIS.xml", ""), - fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""), + // fs.WithFile("00000001.jp2", "12345"), + fs.WithFile("00000001_PREMIS.xml", "abcdef"), + fs.WithFile("00000002.jp2", "67890"), + fs.WithFile("00000002_PREMIS.xml", "ghijk"), + fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", "lmnop"), ), ), fs.WithDir("header", fs.WithDir("old", fs.WithDir("SIP", - fs.WithFile("metadata.xml", ""), + fs.WithFile("metadata.xml", "qrstu"), ), ), ), ), - ), + ).Path(), ), }, want: activities.VerifyManifestResult{ - Failures: []string{ + Failed: true, + MissingFiles: []string{ "Missing file: content/content/d_0000001/00000001.jp2", "Missing file: content/header/xsd/arelda.xsd", }, @@ -283,35 +256,77 @@ func TestVerifyManifest(t *testing.T) { fs.WithDir("content", fs.WithDir("d_0000001", fs.WithFile("extra_file.txt", "I'm an extra file."), - fs.WithFile("00000001.jp2", ""), - fs.WithFile("00000001_PREMIS.xml", ""), - fs.WithFile("00000002.jp2", ""), - fs.WithFile("00000002_PREMIS.xml", ""), - fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""), + fs.WithFile("00000001.jp2", "12345"), + fs.WithFile("00000001_PREMIS.xml", "abcdef"), + fs.WithFile("00000002.jp2", "67890"), + fs.WithFile("00000002_PREMIS.xml", "ghijk"), + fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", "lmnop"), ), ), fs.WithDir("header", fs.WithDir("old", fs.WithDir("SIP", - fs.WithFile("metadata.xml", ""), + fs.WithFile("metadata.xml", "qrstu"), ), ), fs.WithDir("xsd", - fs.WithFile("arelda.xsd", ""), - fs.WithFile("extra.xsd", ""), + fs.WithFile("arelda.xsd", "vwxyz"), + fs.WithFile("extra.xsd", "I'm an extra XSD file."), ), ), ), - ), + ).Path(), ), }, want: activities.VerifyManifestResult{ - Failures: []string{ + Failed: true, + UnexpectedFiles: []string{ "Unexpected file: 
content/content/d_0000001/extra_file.txt", "Unexpected file: content/header/xsd/extra.xsd", }, }, }, + { + name: "Returns a list of mismatched checksums", + params: activities.VerifyManifestParams{ + SIP: testSIP( + t, + fs.NewDir(t, "Test_Extra_Files", + fs.WithDir("additional", + fs.WithFile("UpdatedAreldaMetadata.xml", aipManifest), + ), + fs.WithDir("content", + fs.WithDir("content", + fs.WithDir("d_0000001", + fs.WithFile("00000001.jp2", "wrong checksum"), + fs.WithFile("00000001_PREMIS.xml", "abcdef"), + fs.WithFile("00000002.jp2", "67890"), + fs.WithFile("00000002_PREMIS.xml", "ghijk"), + fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", "lmnop"), + ), + ), + fs.WithDir("header", + fs.WithDir("old", + fs.WithDir("SIP", + fs.WithFile("metadata.xml", "also wrong checksum"), + ), + ), + fs.WithDir("xsd", + fs.WithFile("arelda.xsd", "vwxyz"), + ), + ), + ), + ).Path(), + ), + }, + want: activities.VerifyManifestResult{ + Failed: true, + ChecksumFailures: []string{ + `Checksum mismatch for "content/content/d_0000001/00000001.jp2" (expected: "827ccb0eea8a706c4c34a16891f84e7b", got: "2714364e3a0ac68e8bf9b898b31ff303")`, + `Checksum mismatch for "content/header/old/SIP/metadata.xml" (expected: "636351dce76b47b3d40712813b9a34f3", got: "dff24b6a34ff7ab645cb477e090bee5f")`, + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml b/internal/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml similarity index 98% rename from internal/workflow/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml rename to internal/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml index 6f8b7ae1..55212814 100644 --- a/internal/workflow/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml +++ 
b/internal/testdata/little-Test-AIP-Digitization/additional/UpdatedAreldaMetadata.xml @@ -21,7 +21,7 @@ metadata.xml metadata.xml MD5 - 43c533d499c572fca699e77e06295ba3 + 29a08a4c20e931bf91079abb6951c79e @@ -121,13 +121,13 @@ 00000001_PREMIS.xml 00000001_PREMIS.xml MD5 - 1428a269ff4e5b4894793b68646984b7 + 11f791c655cae7f4ab1dcaf613314151 00000002_PREMIS.xml 00000002_PREMIS.xml MD5 - f338f61911d2620972b0ac668dcc37ec + 4ed338cc69d7635450b5332fb727d2d3 Prozess_Digitalisierung_PREMIS.xml @@ -155,7 +155,7 @@ Bundesverwaltung (Bern) 1000/893_3251903 - + Bundesverwaltung (k.A.) @@ -176,14 +176,14 @@ Beschwerde von Pfyffer-Zimmermann aus Witznau und Zurücknahme derselbigen digital - - + + - + 1874-04-01 - + 1874-11-30 @@ -191,7 +191,7 @@ U_m_s_c_h_l_a_g_0000001 digital - + 000001 _miEf29GTkFR7ymi91IV4fO @@ -211,7 +211,7 @@ D_o_k_u_m_e_n_t_0000001 digital - + 000002 _g1rji2dQsuGpZLBIaAiMPA @@ -303,7 +303,7 @@ D_o_k_u_m_e_n_t_0000002 digital - + 000003 _5GbCxR3sghNXT8xA5s9HyZ @@ -337,7 +337,7 @@ D_o_k_u_m_e_n_t_0000003 digital - + 000004 _RO2p692zkgdsiRQMmkQArf @@ -379,7 +379,7 @@ D_o_k_u_m_e_n_t_0000004 digital - + 000005 _MMXDGhijq4qJC2aopeBET2 @@ -417,7 +417,7 @@ D_o_k_u_m_e_n_t_0000005 digital - + 000006 _KZ4jMOlQzsRx7CaeDFivy5 @@ -469,7 +469,7 @@ D_o_k_u_m_e_n_t_0000006 digital - + 000007 _fqsPyR6gDpgU5Z3ArCJNeM @@ -491,7 +491,7 @@ D_o_k_u_m_e_n_t_0000007 digital - + 000008 _RqkBRp1fesebvsPIEiUNQd @@ -503,7 +503,7 @@ D_o_k_u_m_e_n_t_0000008 digital - + 000009 _Xb7BEeQlYVD5iKjhN002VT @@ -521,7 +521,7 @@ D_o_k_u_m_e_n_t_0000009 digital - + 000010 _kgiVYxgtd2scGDhIt8dozW @@ -547,7 +547,7 @@ D_o_k_u_m_e_n_t_0000010 digital - + 000011 _TiPs089ulN6kLu00MwFiA5 @@ -561,7 +561,7 @@ D_o_k_u_m_e_n_t_0000011 digital - + 000012 _dxHJAQRH9faWPAy2SHGHkK @@ -579,7 +579,7 @@ D_o_k_u_m_e_n_t_0000012 digital - + 000013 _YmgmI3c08DnHMLZNhjyq25 @@ -597,7 +597,7 @@ D_o_k_u_m_e_n_t_0000013 digital - + 000014 _zNm1O32k4ujOrXNyxlTreS @@ -637,7 +637,7 @@ D_o_k_u_m_e_n_t_0000014 digital - + 
000015 _BblSEzq4DYiBdsNmhCHVgC @@ -655,7 +655,7 @@ D_o_k_u_m_e_n_t_0000015 digital - + 000016 _qvO2zsx328rTNX4bdzB06o @@ -667,7 +667,7 @@ D_o_k_u_m_e_n_t_0000016 digital - + 000017 _FqoZqKtYsQn9iAdoqqv1cI @@ -691,7 +691,7 @@ D_o_k_u_m_e_n_t_0000017 digital - + 000018 _mknnbMIDwFwyyAhb0nNNAi diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001.jp2 b/internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001.jp2 similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001.jp2 rename to internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001.jp2 diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001_PREMIS.xml b/internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001_PREMIS.xml similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001_PREMIS.xml rename to internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000001_PREMIS.xml diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002.jp2 b/internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002.jp2 similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002.jp2 rename to internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002.jp2 diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002_PREMIS.xml b/internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002_PREMIS.xml similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002_PREMIS.xml rename to 
internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/00000002_PREMIS.xml diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/Prozess_Digitalisierung_PREMIS.xml b/internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/Prozess_Digitalisierung_PREMIS.xml similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/content/d_0000001/Prozess_Digitalisierung_PREMIS.xml rename to internal/testdata/little-Test-AIP-Digitization/content/content/d_0000001/Prozess_Digitalisierung_PREMIS.xml diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/old/SIP/metadata.xml b/internal/testdata/little-Test-AIP-Digitization/content/header/old/SIP/metadata.xml similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/old/SIP/metadata.xml rename to internal/testdata/little-Test-AIP-Digitization/content/header/old/SIP/metadata.xml diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ablieferung.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ablieferung.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ablieferung.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ablieferung.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischeNotiz.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischeNotiz.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischeNotiz.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischeNotiz.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischerVorgang.xsd 
b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischerVorgang.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischerVorgang.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/archivischerVorgang.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/arelda.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/arelda.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/arelda.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/arelda.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/base.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/base.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/base.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/base.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/datei.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/datei.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/datei.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/datei.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/dokument.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/dokument.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/dokument.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/dokument.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/dossier.xsd 
b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/dossier.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/dossier.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/dossier.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordner.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordner.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordner.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordner.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystem.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystem.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystem.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystem.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystemposition.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystemposition.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystemposition.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/ordnungssystemposition.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/paket.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/paket.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/paket.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/paket.xsd diff --git 
a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/provenienz.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/provenienz.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/provenienz.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/provenienz.xsd diff --git a/internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/zusatzDaten.xsd b/internal/testdata/little-Test-AIP-Digitization/content/header/xsd/zusatzDaten.xsd similarity index 100% rename from internal/workflow/testdata/little-Test-AIP-Digitization/content/header/xsd/zusatzDaten.xsd rename to internal/testdata/little-Test-AIP-Digitization/content/header/xsd/zusatzDaten.xsd diff --git a/internal/workflow/preprocessing.go b/internal/workflow/preprocessing.go index a4e98e02..0bcfbf96 100644 --- a/internal/workflow/preprocessing.go +++ b/internal/workflow/preprocessing.go @@ -3,6 +3,7 @@ package workflow import ( "fmt" "path/filepath" + "slices" "strings" "time" @@ -133,6 +134,7 @@ func (w *PreprocessingWorkflow) Execute( // Verify that package contents match the manifest. 
verifyManifestEvent := newEvent(ctx, "Verify SIP manifest") + verifyChecksumsEvent := newEvent(ctx, "Verify SIP checksums") var verifyManifest activities.VerifyManifestResult e = temporalsdk_workflow.ExecuteActivity( withLocalActOpts(ctx), @@ -148,13 +150,14 @@ func (w *PreprocessingWorkflow) Execute( return systemError(logger, "Verify manifest", &result, e), nil } - if verifyManifest.Failures != nil { + if len(verifyManifest.MissingFiles) > 0 || len(verifyManifest.UnexpectedFiles) > 0 { + failures := slices.Concat(verifyManifest.MissingFiles, verifyManifest.UnexpectedFiles) verifyManifestEvent.Complete( ctx, enums.EventOutcomeValidationFailure, "Content error: SIP contents do not match %q:\n%s", filepath.Base(identifySIP.SIP.ManifestPath), - strings.Join(verifyManifest.Failures, "\n"), + strings.Join(failures, "\n"), ) } else { verifyManifestEvent.Complete( @@ -165,6 +168,22 @@ func (w *PreprocessingWorkflow) Execute( } result.addEvent(verifyManifestEvent) + if len(verifyManifest.ChecksumFailures) > 0 { + verifyChecksumsEvent.Complete( + ctx, + enums.EventOutcomeValidationFailure, + "Content error: SIP checksums do not match file contents:\n%s", + strings.Join(verifyManifest.ChecksumFailures, "\n"), + ) + } else { + verifyChecksumsEvent.Complete( + ctx, + enums.EventOutcomeSuccess, + "SIP checksums match file contents", + ) + } + result.addEvent(verifyChecksumsEvent) + // Validate file formats. validateFileFormatsEvent := newEvent(ctx, "Validate SIP file formats") var validateFileFormats activities.ValidateFileFormatsResult @@ -285,9 +304,6 @@ func (w *PreprocessingWorkflow) Execute( "SIP has been bagged", )) - // TODO: validate checksums located in the XML metadata file - // against the checksums generated on Bag creation. 
- return &result, nil } diff --git a/internal/workflow/preprocessing_test.go b/internal/workflow/preprocessing_test.go index 714fe39b..161f16e4 100644 --- a/internal/workflow/preprocessing_test.go +++ b/internal/workflow/preprocessing_test.go @@ -370,6 +370,13 @@ func (s *PreprocessingTestSuite) TestPreprocessingWorkflowSuccess() { StartedAt: testTime, CompletedAt: testTime, }, + { + Name: "Verify SIP checksums", + Message: "SIP checksums match file contents", + Outcome: enums.EventOutcomeSuccess, + StartedAt: testTime, + CompletedAt: testTime, + }, { Name: "Validate SIP file formats", Message: "No disallowed file formats found", @@ -480,10 +487,12 @@ func (s *PreprocessingTestSuite) TestPreprocessingWorkflowValidationFails() { &activities.VerifyManifestParams{SIP: expectedSIP}, ).Return( &activities.VerifyManifestResult{ - Failures: []string{ - "Missing file: d_0000001/00000001.jp2", - "Unexpected file: d_0000001/extra_file.txt", + Failed: true, + ChecksumFailures: []string{ + `Checksum mismatch for "content/content/d_0000001/00000001.jp2" (expected: "827ccb0eea8a706c4c34a16891f84e7b", got: "2714364e3a0ac68e8bf9b898b31ff303")`, }, + MissingFiles: []string{"Missing file: d_0000001/00000001.jp2"}, + UnexpectedFiles: []string{"Unexpected file: d_0000001/extra_file.txt"}, }, nil, ) @@ -550,6 +559,14 @@ Unexpected file: d_0000001/extra_file.txt`, StartedAt: testTime, CompletedAt: testTime, }, + { + Name: "Verify SIP checksums", + Message: `Content error: SIP checksums do not match file contents: +Checksum mismatch for "content/content/d_0000001/00000001.jp2" (expected: "827ccb0eea8a706c4c34a16891f84e7b", got: "2714364e3a0ac68e8bf9b898b31ff303")`, + Outcome: enums.EventOutcomeValidationFailure, + StartedAt: testTime, + CompletedAt: testTime, + }, { Name: "Validate SIP file formats", Message: `Content error: file format validation has failed. One or more file formats are not allowed: