Skip to content

Commit

Permalink
Add checksum verification to "verify manifest"
Browse files Browse the repository at this point in the history
Fixes #40.

- Switch to `manifest.Files()` to parse SFA manifest files more
  efficiently
- Add checksum verification to the "verify manifest" activity
- Move "internal/workflow/testdata/little-Test-AIP-Digitization" to
  "internal/testdata/little-Test-AIP-Digitization"
- Update "testdata/little-Test-AIP-Digitization" file hashes to match
  file contents
- Add a verify checksums event to the preprocessing workflow
  • Loading branch information
djjuhasz committed Sep 6, 2024
1 parent d8bd32d commit 0fbc5fb
Show file tree
Hide file tree
Showing 27 changed files with 270 additions and 156 deletions.
2 changes: 0 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ module github.com/artefactual-sdps/preprocessing-sfa
go 1.22.6

require (
github.com/antchfx/xmlquery v1.4.1
github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b
github.com/beevik/etree v1.4.0
github.com/deckarep/golang-set/v2 v2.6.0
Expand All @@ -23,7 +22,6 @@ require (
)

require (
github.com/antchfx/xpath v1.3.1 // indirect
github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
Expand Down
5 changes: 0 additions & 5 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF
cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/antchfx/xmlquery v1.4.1 h1:YgpSwbeWvLp557YFTi8E3z6t6/hYjmFEtiEKbDfEbl0=
github.com/antchfx/xmlquery v1.4.1/go.mod h1:lKezcT8ELGt8kW5L+ckFMTbgdR61/odpPgDv8Gvi1fI=
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b h1:kTOc2pbkdII6/Z84Bus1q52z5KAOaT8vLpfRoOs1l1I=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
Expand Down Expand Up @@ -309,7 +305,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
Expand Down
168 changes: 119 additions & 49 deletions internal/activities/verify_manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@ package activities

import (
"context"
"crypto/md5" // #nosec: 501 -- not used for security.
"encoding/hex"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"slices"

"github.com/antchfx/xmlquery"
goset "github.com/deckarep/golang-set/v2"

"github.com/artefactual-sdps/preprocessing-sfa/internal/enums"
"github.com/artefactual-sdps/preprocessing-sfa/internal/manifest"
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
)

Expand All @@ -23,7 +26,10 @@ type (
SIP sip.SIP
}
VerifyManifestResult struct {
Failures []string
Failed bool
ChecksumFailures []string
MissingFiles []string
UnexpectedFiles []string
}
)

Expand All @@ -39,76 +45,56 @@ func (a *VerifyManifest) Execute(ctx context.Context, params *VerifyManifestPara
if err != nil {
return nil, fmt.Errorf("verify manifest: parse manifest: %v", err)
}
manifestSet := goset.NewSetFromMapKeys(manifestFiles)

sipFiles, err := sipFiles(params.SIP)
if err != nil {
return nil, fmt.Errorf("verify manifest: get SIP contents: %v", err)
}

var failures []string

if s := manifestFiles.Difference(sipFiles).ToSlice(); len(s) > 0 {
slices.Sort(s)
for _, p := range s {
failures = append(failures, fmt.Sprintf("Missing file: %s", p))
}
badChecksums, err := verifyChecksums(manifestFiles, sipFiles, params.SIP.Path)
if err != nil {
return nil, fmt.Errorf("verify checksums: %v", err)

Check warning on line 57 in internal/activities/verify_manifest.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/verify_manifest.go#L57

Added line #L57 was not covered by tests
}

if s := sipFiles.Difference(manifestFiles).ToSlice(); len(s) > 0 {
slices.Sort(s)
for _, p := range s {
failures = append(failures, fmt.Sprintf("Unexpected file: %s", p))
}
}
missing := missingFiles(manifestSet, sipFiles)
unexpected := unexpectedFiles(manifestSet, sipFiles)

return &VerifyManifestResult{Failures: failures}, nil
return &VerifyManifestResult{
Failed: len(missing) > 0 || len(unexpected) > 0 || len(badChecksums) > 0,
ChecksumFailures: badChecksums,
MissingFiles: missing,
UnexpectedFiles: unexpected,
}, nil
}

// manifestFiles returns the set of all files paths listed in a SIP's manifest.
func manifestFiles(s sip.SIP) (goset.Set[string], error) {
// manifestFiles parses the SIP manifest and returns a map of file paths
// (relative to the SIP root directory) to file checksums.
func manifestFiles(s sip.SIP) (map[string]*manifest.Checksum, error) {
f, err := os.Open(s.ManifestPath)
if err != nil {
return nil, fmt.Errorf("open: %v", err)
}

doc, err := xmlquery.Parse(f)
files, err := manifest.Files(f)
if err != nil {
return nil, fmt.Errorf("parse document: %v", err)
}

manifest, err := xmlquery.Query(doc, "//paket/inhaltsverzeichnis")
if err != nil || manifest == nil {
return nil, fmt.Errorf("missing inhaltsverzeichnis entry: %v", err)
return nil, err

Check warning on line 81 in internal/activities/verify_manifest.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/verify_manifest.go#L81

Added line #L81 was not covered by tests
}

root := ""
// Prefix "content/" to digitized AIP file paths.
if s.Type == enums.SIPTypeDigitizedAIP {
root = "content"
}

return walkNode(manifest, root), nil
}

// walkNode recursively walks node's xpath tree and returns the set of all file
// (excluding directories) paths found.
func walkNode(node *xmlquery.Node, path string) goset.Set[string] {
paths := goset.NewSet[string]()

for _, n := range node.SelectElements("ordner") {
name := n.SelectElement("name").InnerText()
paths = paths.Union(walkNode(n, filepath.Join(path, name)))
}

for _, n := range node.SelectElements("datei") {
name := n.SelectElement("name").InnerText()
paths.Add(filepath.Join(path, name))
m := make(map[string]*manifest.Checksum, len(files))
for k, v := range files {
m[filepath.Join("content", k)] = v
}
files = m
}

return paths
return files, nil
}

// sipFiles recursively walks dir's tree and returns the set of all file
// (excluding directories) paths found.
// (excluding directory) paths found.
func sipFiles(s sip.SIP) (goset.Set[string], error) {
root := s.Path
if s.Type == enums.SIPTypeDigitizedAIP {
Expand All @@ -130,8 +116,8 @@ func sipFiles(s sip.SIP) (goset.Set[string], error) {
return err
}

// Digitized SIP and born-digital SIPs don't include metadata.xml in the
// manifest, so ignore the file here.
// Digitized SIPs and born-digital SIPs don't include metadata.xml in
// the manifest, so ignore the file here.
if s.Type != enums.SIPTypeDigitizedAIP && p == "header/metadata.xml" {
return nil
}
Expand All @@ -146,3 +132,87 @@ func sipFiles(s sip.SIP) (goset.Set[string], error) {

return paths, nil
}

// missingFiles reports the paths that are members of manifest but not of
// filesys. Each path is rendered as "Missing file: <path>" and the messages
// follow the sorted order of the paths. The result is nil when no path is
// missing.
func missingFiles(manifest, filesys goset.Set[string]) []string {
	paths := manifest.Difference(filesys).ToSlice()
	if len(paths) == 0 {
		return nil
	}

	// Set iteration order is not relied upon; sort for stable output.
	slices.Sort(paths)

	msgs := make([]string, 0, len(paths))
	for _, path := range paths {
		msgs = append(msgs, fmt.Sprintf("Missing file: %s", path))
	}
	return msgs
}

// unexpectedFiles reports the paths that are members of filesys but not of
// manifest. Each path is rendered as "Unexpected file: <path>" and the
// messages follow the sorted order of the paths. The result is nil when there
// are no unexpected paths.
func unexpectedFiles(manifest, filesys goset.Set[string]) []string {
	extra := filesys.Difference(manifest).ToSlice()
	if len(extra) == 0 {
		return nil
	}

	// Sort first so the formatted messages come out in path order.
	slices.Sort(extra)

	out := make([]string, len(extra))
	for i := range extra {
		out[i] = fmt.Sprintf("Unexpected file: %s", extra[i])
	}
	return out
}

// verifyChecksums checks that each manifestFiles file checksum matches the
// checksum generated from the actual file contents. If a file is on the
// manifest but missing from the filesystem, or vice versa, it will be skipped
// with no validation message. The root is the absolute path to the root
// directory of the SIP, and is prefixed to each relative file path in the
// manifest to create an absolute path the file.
func verifyChecksums(
manifestFiles map[string]*manifest.Checksum,
sipFiles goset.Set[string],
root string,
) ([]string, error) {
var failures []string

for path, checksum := range manifestFiles {
// Check if file exists on filesystem.
if !sipFiles.Contains(path) {
continue
}

// Generate checksum from filesystem file contents.
switch checksum.Algorithm {
case "MD5":
hash, err := md5Hash(filepath.Join(root, path))
if err != nil {
return nil, fmt.Errorf("generate MD5 hash: %v", err)

Check warning on line 186 in internal/activities/verify_manifest.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/verify_manifest.go#L186

Added line #L186 was not covered by tests
}
if hash != checksum.Hash {
failures = append(
failures,
fmt.Sprintf("Checksum mismatch for %q (expected: %q, got: %q)", path, checksum.Hash, hash),
)
}
default:
return nil, fmt.Errorf("hash algorithm %q is not supported", checksum.Algorithm)

Check warning on line 195 in internal/activities/verify_manifest.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/verify_manifest.go#L194-L195

Added lines #L194 - L195 were not covered by tests
}
}
slices.Sort(failures)

return failures, nil
}

// md5Hash returns the lowercase hexadecimal encoding of the MD5 digest of the
// contents of the file at path.
//
// The file is streamed into the hash with io.Copy rather than read into
// memory in one piece. Failures to open or read the file are returned wrapped
// with %w, so callers can still match the underlying cause with errors.Is or
// errors.As (for example fs.ErrNotExist); the error text is unchanged.
func md5Hash(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", fmt.Errorf("open file: %w", err)
	}
	// The file is only read, so a Close error carries no useful information.
	defer f.Close()

	h := md5.New() // #nosec: G401 -- not used for security.
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("copy contents: %w", err)
	}

	return hex.EncodeToString(h.Sum(nil)), nil
}
Loading

0 comments on commit 0fbc5fb

Please sign in to comment.