Skip to content

Commit

Permalink
Add an activity to merge AIS metadata files
Browse files Browse the repository at this point in the history
Fixes #77.

Concatenate the Arelda metadata file from the original package and the
METS file created by Archivematica into a single "AIS" metadata file.

[skip codecov]
  • Loading branch information
djjuhasz committed Nov 13, 2024
1 parent fa37e42 commit 12acc86
Show file tree
Hide file tree
Showing 6 changed files with 1,039 additions and 0 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/artefactual-sdps/preprocessing-sfa
go 1.23.2

require (
github.com/antchfx/xmlquery v1.4.2
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4
github.com/beevik/etree v1.4.0
github.com/deckarep/golang-set/v2 v2.6.0
Expand All @@ -22,6 +23,7 @@ require (
)

require (
github.com/antchfx/xpath v1.3.2 // indirect
github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF
cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA=
github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA=
github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U=
github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
Expand Down Expand Up @@ -307,6 +311,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
Expand Down
138 changes: 138 additions & 0 deletions internal/ais/combinemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
package ais

import (
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"

"github.com/antchfx/xmlquery"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fsutil"
)

// CombineMDActivityName is the name under which CombineMDActivity is
// registered with, and invoked through, the Temporal worker.
const CombineMDActivityName = "combine-metadata-files"

// CombineMDActivity concatenates the Arelda and METS metadata files of a
// package into a single AIS metadata file.
type CombineMDActivity struct{}

// CombineMDActivityParams holds the inputs of CombineMDActivity.
type CombineMDActivityParams struct {
	// AreldaRelPath is the path of the Arelda metadata file, relative to
	// WorkingDir.
	AreldaRelPath string

	// METSRelPath is the path of the METS file, relative to WorkingDir.
	METSRelPath string

	// WorkingDir is the directory that the relative paths are resolved
	// against and in which the combined AIS file is created.
	WorkingDir string
}

// CombineMDActivityResult holds the output of CombineMDActivity.
type CombineMDActivityResult struct {
	// Path is the path of the combined AIS file.
	Path string
}

// NewCombineMDActivity returns a new, ready to use CombineMDActivity.
func NewCombineMDActivity() *CombineMDActivity {
	return new(CombineMDActivity)
}

// Execute concatenates the Arelda metadata file and the METS file found under
// params.WorkingDir into a single AIS file, created in params.WorkingDir with
// mode 0o644, then deletes the two source files.
//
// The AIS file name is derived from the accession number in the Arelda file
// (see aisFilename). The returned result holds the path of the AIS file.
//
// An error is returned if either source file is missing, if the AIS file name
// can't be determined, if the AIS file can't be created or written, or if a
// source file can't be deleted. The source files are only deleted once the
// AIS file has been written and closed without error.
func (a *CombineMDActivity) Execute(
	ctx context.Context,
	params CombineMDActivityParams,
) (*CombineMDActivityResult, error) {
	areldaPath := filepath.Join(params.WorkingDir, params.AreldaRelPath)
	if !fsutil.FileExists(areldaPath) {
		return nil, fmt.Errorf("missing Arelda file: %s", areldaPath)
	}

	metsPath := filepath.Join(params.WorkingDir, params.METSRelPath)
	if !fsutil.FileExists(metsPath) {
		return nil, fmt.Errorf("missing METS file: %s", metsPath)
	}

	aisName, err := aisFilename(areldaPath)
	if err != nil {
		return nil, fmt.Errorf("name AIS file: %w", err)
	}

	dest := filepath.Join(params.WorkingDir, aisName)

	// Combine metadata files into AIS file.
	w, err := os.Create(dest) // #nosec G304 -- generated path.
	if err != nil {
		return nil, fmt.Errorf("create AIS file: %w", err)
	}
	// Release the file handle on the early-return paths below. The error is
	// deliberately ignored here: the success path closes the file explicitly
	// and checks the result.
	defer w.Close()

	if err := w.Chmod(os.FileMode(0o644)); err != nil {
		return nil, fmt.Errorf("set AIS file permissions: %w", err)
	}

	if err := concat(w, areldaPath, metsPath); err != nil {
		return nil, fmt.Errorf("concat: %w", err)
	}

	// Close before deleting the originals: a failed close can mean the data
	// was not fully written, in which case the source files must be kept.
	if err := w.Close(); err != nil {
		return nil, fmt.Errorf("close AIS file: %w", err)
	}

	// Delete original metadata files.
	if err := removePaths(areldaPath, metsPath); err != nil {
		return nil, fmt.Errorf("removePaths: %w", err)
	}

	return &CombineMDActivityResult{Path: dest}, nil
}

// aisFilename returns the name of the AIS file for the package described by
// the Arelda metadata file at mdpath. The name is "AIS_" followed by the
// accession number read from that file, with every "/" replaced by "_" so the
// result is a single path element.
//
// It returns an error, wrapping the underlying cause, if the accession number
// can't be read from mdpath.
func aisFilename(mdpath string) (string, error) {
	id, err := parseAccessionID(mdpath)
	if err != nil {
		return "", fmt.Errorf("get accession number: %w", err)
	}

	return "AIS_" + strings.ReplaceAll(id, "/", "_"), nil
}

// parseAccessionID returns the inner text of the first element matching
// "//paket/ablieferung/ablieferungsnummer" in the XML file at path.
//
// The file is read with a stream parser, so parsing stops at the first match.
// An error is returned if the file can't be opened, if the XML stream can't
// be read, or if the end of the file is reached without a match.
func parseAccessionID(path string) (string, error) {
	f, err := os.Open(path) // #nosec G304 -- trusted path.
	if err != nil {
		return "", fmt.Errorf("open metadata file: %w", err)
	}
	defer f.Close()

	sp, err := xmlquery.CreateStreamParser(f, "//paket/ablieferung/ablieferungsnummer")
	if err != nil {
		return "", fmt.Errorf("create XML parser: %w", err)
	}

	n, err := sp.Read()
	// errors.Is also matches an io.EOF that has been wrapped on its way up.
	if errors.Is(err, io.EOF) {
		return "", fmt.Errorf("can't find ablieferungsnummer in %q", filepath.Base(path))
	}
	if err != nil {
		return "", fmt.Errorf("read XML stream: %w", err)
	}

	return n.InnerText(), nil
}

// concat writes the contents of the files at paths to w, in the order given.
//
// It stops at the first failure and returns an error wrapping the cause:
// "read: ..." if a file can't be opened, "copy: ..." if its contents can't be
// copied to w. Data already written to w is not rolled back. Calling concat
// with no paths writes nothing and returns nil.
func concat(w io.Writer, paths ...string) error {
	for _, p := range paths {
		if err := appendFile(w, p); err != nil {
			return err
		}
	}

	return nil
}

// appendFile copies the contents of the file at path to w. Keeping this in
// its own function lets the deferred Close run as soon as the file has been
// copied, rather than when the whole concat loop has finished.
func appendFile(w io.Writer, path string) error {
	r, err := os.Open(path) // #nosec G304 -- trusted path.
	if err != nil {
		return fmt.Errorf("read: %w", err)
	}
	defer r.Close()

	if _, err := io.Copy(w, r); err != nil {
		return fmt.Errorf("copy: %w", err)
	}

	return nil
}

// removePaths removes each of the given paths with os.Remove.
//
// A failure does not stop the loop: every path is attempted, and the failures
// are returned together as one joined error, each wrapped as "remove: ...".
// The underlying errors stay reachable through errors.Is and errors.As.
// It returns nil if every removal succeeds or if no paths are given.
func removePaths(paths ...string) error {
	var errs []error
	for _, p := range paths {
		if err := os.Remove(p); err != nil {
			errs = append(errs, fmt.Errorf("remove: %w", err))
		}
	}

	// errors.Join returns nil when errs is empty.
	return errors.Join(errs...)
}
128 changes: 128 additions & 0 deletions internal/ais/combinemd_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package ais_test

import (
"strings"
"testing"

temporalsdk_activity "go.temporal.io/sdk/activity"
temporalsdk_testsuite "go.temporal.io/sdk/testsuite"
"gotest.tools/v3/assert"
"gotest.tools/v3/fs"

"github.com/artefactual-sdps/preprocessing-sfa/internal/ais"
)

const (
	// arelda is a minimal, well-formed Arelda metadata document whose
	// ablieferungsnummer (accession number) is "1000/893_3251903".
	arelda = `<?xml version="1.0" encoding="UTF-8"?>
<paket xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xip="http://www.tessella.com/XIP/v4"
xmlns="http://bar.admin.ch/arelda/v4"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:submissionTests="http://bar.admin.ch/submissionTestResult"
xsi:type="paketAIP"
schemaVersion="5.0">
<ablieferung xsi:type="ablieferungFilesAIP">
<ablieferungstyp>FILES</ablieferungstyp>
<ablieferndeStelle>Bundesverwaltung (Bern)</ablieferndeStelle>
<ablieferungsnummer>1000/893_3251903</ablieferungsnummer>
</ablieferung>
</paket>
`

	// mets is a minimal, empty METS document. It contains no
	// ablieferungsnummer element, so it also serves as an invalid Arelda file.
	mets = `<?xml version='1.0' encoding='UTF-8'?>
<mets:mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mets="http://www.loc.gov/METS/" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd">
</mets:mets>
`
)

// testDir creates a temporary working directory containing an "arelda.xml"
// and a "mets.xml" fixture file, and returns its path.
func testDir(t *testing.T) string {
	// Report fixture setup failures at the caller's line.
	t.Helper()

	td := fs.NewDir(t, "ppsfa",
		fs.WithFile("arelda.xml", arelda),
		fs.WithFile("mets.xml", mets),
	)

	return td.Path()
}

// TestExecute runs CombineMDActivity through a Temporal test activity
// environment and checks the returned result, the returned error, and the
// final contents of the working directory.
//
// The "{{wd}}" placeholder in want.Path and wantErr is replaced with the
// test case's working directory before comparison.
func TestExecute(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name         string
		params       ais.CombineMDActivityParams
		want         ais.CombineMDActivityResult
		wantErr      string
		wantManifest fs.Manifest
	}{
		{
			name: "Returns the combined metadata",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "arelda.xml",
				METSRelPath:   "mets.xml",
				WorkingDir:    testDir(t),
			},
			want: ais.CombineMDActivityResult{Path: "{{wd}}/AIS_1000_893_3251903"},
			// Only the AIS file is expected: the source files are deleted.
			wantManifest: fs.Expected(t,
				fs.WithFile("AIS_1000_893_3251903", arelda+mets, fs.WithMode(0o644)),
			),
		},
		{
			name: "Errors if the Arelda file doesn't exist",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "missing.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing Arelda file: {{wd}}/missing.xml",
		},
		{
			name: "Errors if the METS file doesn't exist",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "arelda.xml",
				METSRelPath:   "missing.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing METS file: {{wd}}/missing.xml",
		},
		{
			name: "Errors when the Arelda file is invalid",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "mets.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): name AIS file: get accession number: can't find ablieferungsnummer in \"mets.xml\"",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			ts := &temporalsdk_testsuite.WorkflowTestSuite{}
			env := ts.NewTestActivityEnvironment()
			env.RegisterActivityWithOptions(
				ais.NewCombineMDActivity().Execute,
				temporalsdk_activity.RegisterOptions{Name: ais.CombineMDActivityName},
			)

			// Expand the working directory placeholder into locals rather
			// than mutating the table entry.
			wd := tt.params.WorkingDir
			want := ais.CombineMDActivityResult{
				Path: strings.ReplaceAll(tt.want.Path, "{{wd}}", wd),
			}
			wantErr := strings.ReplaceAll(tt.wantErr, "{{wd}}", wd)

			future, err := env.ExecuteActivity(ais.CombineMDActivityName, tt.params)
			if wantErr != "" {
				// ErrorContains also fails the test when err is nil.
				assert.ErrorContains(t, err, wantErr)
				return
			}
			assert.NilError(t, err)

			var got ais.CombineMDActivityResult
			// A decoding failure would otherwise surface as a confusing
			// mismatch against a zero-valued result.
			assert.NilError(t, future.Get(&got))
			assert.DeepEqual(t, got, want)
			assert.Assert(t, fs.Equal(wd, tt.wantManifest))
		})
	}
}
18 changes: 18 additions & 0 deletions internal/ais/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,20 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, aipUUID, aip
return "", e
}

var combineMDResult CombineMDActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withFilesystemActivityOpts(ctx),
CombineMDActivityName,
&CombineMDActivityParams{
AreldaRelPath: metadataRelPath,
METSRelPath: metsPath,
WorkingDir: filepath.Join(localDir, filepath.Base(metadataRelPath)),
},
).Get(ctx, &combineMDResult)
if e != nil {
return "", e
}

var zipResult archivezip.Result
e = temporalsdk_workflow.ExecuteActivity(
withFilesystemActivityOpts(ctx),
Expand Down Expand Up @@ -231,6 +245,10 @@ func RegisterWorkflow(ctx context.Context, tw temporalsdk_worker.Worker, config
NewParseActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: ParseActivityName},
)
tw.RegisterActivityWithOptions(
NewCombineMDActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: CombineMDActivityName},
)
tw.RegisterActivityWithOptions(
archivezip.New().Execute,
temporalsdk_activity.RegisterOptions{Name: archivezip.Name},
Expand Down
Loading

0 comments on commit 12acc86

Please sign in to comment.