Skip to content

Commit

Permalink
WIP: Add activity to merge AIS metadata files
Browse files Browse the repository at this point in the history
Fixes #77.
  • Loading branch information
djjuhasz committed Nov 7, 2024
1 parent 7dd47ca commit 573ef7f
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 27 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/artefactual-sdps/preprocessing-sfa
go 1.23.2

require (
github.com/antchfx/xmlquery v1.4.2
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4
github.com/beevik/etree v1.4.0
github.com/deckarep/golang-set/v2 v2.6.0
Expand All @@ -23,6 +24,7 @@ require (
)

require (
github.com/antchfx/xpath v1.3.2 // indirect
github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF
cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA=
github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA=
github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U=
github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
Expand Down Expand Up @@ -307,6 +311,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
Expand Down
74 changes: 74 additions & 0 deletions internal/ais/mergemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package ais

import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"

"github.com/antchfx/xmlquery"
)

// MergeMDActivityName is the name that identifies the merge metadata activity.
const MergeMDActivityName = "merge-md-files"

// MergeMDActivity is the activity that determines the AIS metadata file path
// for a working directory. It carries no state.
type MergeMDActivity struct{}

// MergeMDActivityParams holds the inputs of MergeMDActivity.Execute.
type MergeMDActivityParams struct {
	// METSRelPath is not read by Execute yet; presumably the METS file path
	// relative to WorkingDir (TODO confirm).
	METSRelPath string

	// MetadataRelPath is joined onto WorkingDir to locate the XML metadata
	// file the accession number is read from.
	MetadataRelPath string

	// WorkingDir is the base directory for MetadataRelPath and for the
	// returned AIS file path.
	WorkingDir string
}

// MergeMDActivityResult holds the output of MergeMDActivity.Execute.
type MergeMDActivityResult struct {
	// Path is WorkingDir joined with the generated AIS file name.
	Path string
}

// NewMergeMDActivity returns a pointer to a new, zero-valued MergeMDActivity.
func NewMergeMDActivity() *MergeMDActivity {
	return new(MergeMDActivity)
}

// Execute reads the accession number from the metadata file found at
// params.MetadataRelPath (relative to params.WorkingDir), builds the AIS file
// name from it, and returns that name joined onto params.WorkingDir.
//
// It returns an error if the AIS file name can't be determined. The ctx
// argument and params.METSRelPath are not used.
func (a *MergeMDActivity) Execute(ctx context.Context, params MergeMDActivityParams) (*MergeMDActivityResult, error) {
	mdPath := filepath.Join(params.WorkingDir, params.MetadataRelPath)

	name, err := aisFilename(mdPath)
	if err != nil {
		return nil, fmt.Errorf("get AIS filename: %v", err)
	}

	res := MergeMDActivityResult{
		Path: filepath.Join(params.WorkingDir, name),
	}
	return &res, nil
}

// aisFilename builds the AIS file name for the metadata file at mdpath: the
// prefix "AIS_" followed by the file's accession number with every "/"
// replaced by "_". It returns an error if the accession number can't be read.
func aisFilename(mdpath string) (string, error) {
	accessionID, err := parseAccessionID(mdpath)
	if err != nil {
		return "", fmt.Errorf("get accession number: %v", err)
	}

	// Slashes would otherwise be treated as path separators in the name.
	return "AIS_" + strings.ReplaceAll(accessionID, "/", "_"), nil
}

// parseAccessionID returns the accession number stored in the XML metadata
// file at path, i.e. the text of the first element matching
// //paket/ablieferung/ablieferungsnummer, with surrounding whitespace removed.
//
// It returns an error if the file can't be opened, the XML stream can't be
// parsed, no matching element exists, or the matching element has no text.
func parseAccessionID(path string) (string, error) {
	f, err := os.Open(path) // #nosec G304 -- trusted path
	if err != nil {
		return "", fmt.Errorf("open metadata file: %v", err)
	}
	defer f.Close()

	sp, err := xmlquery.CreateStreamParser(f, "//paket/ablieferung/ablieferungsnummer")
	if err != nil {
		return "", fmt.Errorf("create XML parser: %v", err)
	}

	// Only the first match is needed, so Read is called once.
	n, err := sp.Read()
	if err == io.EOF {
		// The stream ended without yielding a matching element.
		return "", fmt.Errorf("can't find ablieferungsnummer in %q", filepath.Base(path))
	}
	if err != nil {
		return "", fmt.Errorf("read XML stream: %v", err)
	}

	// The value ends up in a file name, so drop any whitespace around the
	// element text and refuse an empty value rather than produce a name with
	// no identifier in it.
	id := strings.TrimSpace(n.InnerText())
	if id == "" {
		return "", fmt.Errorf("empty ablieferungsnummer in %q", filepath.Base(path))
	}

	return id, nil
}
83 changes: 83 additions & 0 deletions internal/ais/mergemd_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package ais_test

import (
"path/filepath"
"testing"

temporalsdk_activity "go.temporal.io/sdk/activity"
temporalsdk_testsuite "go.temporal.io/sdk/testsuite"
"gotest.tools/v3/assert"

"github.com/artefactual-sdps/preprocessing-sfa/internal/ais"
)

// TestExecute runs the merge metadata activity in a Temporal test activity
// environment against the AIP fixture in ../testdata and checks the returned
// AIS path, or the returned error message, for each case.
func TestExecute(t *testing.T) {
	t.Parallel()

	aipPath := "../testdata/little-Test-AIP-Digitization/"

	tests := []struct {
		name    string
		params  ais.MergeMDActivityParams
		want    ais.MergeMDActivityResult
		wantErr string
	}{
		{
			name: "Returns an AIS metadata file path",
			params: ais.MergeMDActivityParams{
				METSRelPath:     "",
				MetadataRelPath: "additional/UpdatedAreldaMetadata.xml",
				WorkingDir:      aipPath,
			},
			want: ais.MergeMDActivityResult{
				Path: filepath.Join(aipPath, "AIS_1000_893_3251903"),
			},
		},
		{
			name: "Errors if metadata file doesn't exist",
			params: ais.MergeMDActivityParams{
				METSRelPath:     "",
				MetadataRelPath: "additional/missing.xml",
				WorkingDir:      aipPath,
			},
			wantErr: "activity error (type: merge-md-files, scheduledEventID: 0, startedEventID: 0, identity: ): get AIS filename: get accession number: open metadata file: open ../testdata/little-Test-AIP-Digitization/additional/missing.xml: no such file or directory",
		},
		{
			name: "Errors if metadata file is invalid",
			params: ais.MergeMDActivityParams{
				METSRelPath:     "",
				MetadataRelPath: "content/content/d_0000001/00000001_PREMIS.xml",
				WorkingDir:      aipPath,
			},
			wantErr: "activity error (type: merge-md-files, scheduledEventID: 0, startedEventID: 0, identity: ): get AIS filename: get accession number: can't find ablieferungsnummer in \"00000001_PREMIS.xml\"",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			ts := &temporalsdk_testsuite.WorkflowTestSuite{}
			env := ts.NewTestActivityEnvironment()
			env.RegisterActivityWithOptions(
				ais.NewMergeMDActivity().Execute,
				temporalsdk_activity.RegisterOptions{Name: ais.MergeMDActivityName},
			)

			future, err := env.ExecuteActivity(ais.MergeMDActivityName, tt.params)
			if tt.wantErr != "" {
				// ErrorContains also fails the test when err is nil.
				assert.ErrorContains(t, err, tt.wantErr)
				return
			}
			assert.NilError(t, err)

			// Check the decode error so a failed Get isn't reported as a
			// result mismatch.
			var got ais.MergeMDActivityResult
			assert.NilError(t, future.Get(&got))
			assert.DeepEqual(t, got, tt.want)
		})
	}
}
43 changes: 16 additions & 27 deletions internal/ais/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,36 +151,25 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, params *Work
return "", e
}

var metadataPath string
if parseResult.UpdatedAreldaMetadataRelPath != "" {
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.UpdatedAreldaMetadataRelPath),
Destination: filepath.Join(localDir, filepath.Base(parseResult.UpdatedAreldaMetadataRelPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}
metadataPath = parseResult.UpdatedAreldaMetadataRelPath
} else {
metadataPath = parseResult.MetadataRelPath
}

if parseResult.UpdatedAreldaMetadataRelPath == "" && parseResult.MetadataRelPath != "" {
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.MetadataRelPath),
Destination: filepath.Join(localDir, filepath.Base(parseResult.MetadataRelPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, metadataPath),
Destination: filepath.Join(localDir, filepath.Base(metadataPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}

var zipResult archivezip.Result
Expand Down

0 comments on commit 573ef7f

Please sign in to comment.