From 573ef7fec34b66aa26513615c12c759ca8cfd253 Mon Sep 17 00:00:00 2001
From: David Juhasz
Date: Thu, 7 Nov 2024 15:42:11 -0800
Subject: [PATCH] WIP: Add activity to merge AIS metadata files

Fixes #77.
---
Review notes:
* workflow.go: fetch the metadata file only when a metadata path is set,
  as the code did before this refactor.
* mergemd.go: trim the parsed accession number, reject an empty one, and
  document the exported identifiers.
* mergemd_test.go: assert on the error returned by future.Get.
* The commit ID above and the blob IDs on the "index" lines were not
  regenerated after these revisions.

 go.mod                       |  2 +
 go.sum                       |  5 ++
 internal/ais/mergemd.go      | 96 ++++++++++++++++++++++++++++++++++++
 internal/ais/mergemd_test.go | 83 +++++++++++++++++++++++++++++++
 internal/ais/workflow.go     | 25 +++-------
 5 files changed, 194 insertions(+), 17 deletions(-)
 create mode 100644 internal/ais/mergemd.go
 create mode 100644 internal/ais/mergemd_test.go

diff --git a/go.mod b/go.mod
index 2e6e29ba..41b8bf5a 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,7 @@ module github.com/artefactual-sdps/preprocessing-sfa
 go 1.23.2
 
 require (
+	github.com/antchfx/xmlquery v1.4.2
 	github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4
 	github.com/beevik/etree v1.4.0
 	github.com/deckarep/golang-set/v2 v2.6.0
@@ -23,6 +24,7 @@ require (
 )
 
 require (
+	github.com/antchfx/xpath v1.3.2 // indirect
 	github.com/aws/aws-sdk-go v1.55.5 // indirect
 	github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
 	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
diff --git a/go.sum b/go.sum
index 7ab41d85..7f102ac9 100644
--- a/go.sum
+++ b/go.sum
@@ -13,6 +13,10 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF
 cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
 cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA=
+github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA=
+github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U=
+github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
 github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0=
 github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
 github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
@@ -307,6 +311,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
 golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
 golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
 golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
diff --git a/internal/ais/mergemd.go b/internal/ais/mergemd.go
new file mode 100644
index 00000000..0b01a353
--- /dev/null
+++ b/internal/ais/mergemd.go
@@ -0,0 +1,96 @@
+package ais
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/antchfx/xmlquery"
+)
+
+// MergeMDActivityName is the name MergeMDActivity is registered under.
+const MergeMDActivityName = "merge-md-files"
+
+type (
+	// MergeMDActivity computes the path of the AIS metadata file.
+	MergeMDActivity struct{}
+
+	// MergeMDActivityParams are the inputs of MergeMDActivity.Execute.
+	// MetadataRelPath is joined to WorkingDir; METSRelPath is not read yet.
+	MergeMDActivityParams struct {
+		METSRelPath     string
+		MetadataRelPath string
+		WorkingDir      string
+	}
+
+	// MergeMDActivityResult is the output of MergeMDActivity.Execute.
+	MergeMDActivityResult struct {
+		Path string
+	}
+)
+
+// NewMergeMDActivity returns a new MergeMDActivity.
+func NewMergeMDActivity() *MergeMDActivity {
+	return &MergeMDActivity{}
+}
+
+// Execute reads the accession number from the metadata file and returns the
+// path of the AIS file named after it inside params.WorkingDir. This WIP
+// version only computes the path.
+func (a *MergeMDActivity) Execute(ctx context.Context, params MergeMDActivityParams) (*MergeMDActivityResult, error) {
+	aisName, err := aisFilename(filepath.Join(params.WorkingDir, params.MetadataRelPath))
+	if err != nil {
+		return nil, fmt.Errorf("get AIS filename: %v", err)
+	}
+
+	aisPath := filepath.Join(params.WorkingDir, aisName)
+
+	return &MergeMDActivityResult{Path: aisPath}, nil
+}
+
+// aisFilename returns "AIS_" followed by the accession number found in the
+// metadata file at mdpath, with every "/" replaced by "_".
+func aisFilename(mdpath string) (string, error) {
+	id, err := parseAccessionID(mdpath)
+	if err != nil {
+		return "", fmt.Errorf("get accession number: %v", err)
+	}
+
+	id = strings.ReplaceAll(id, "/", "_")
+
+	return fmt.Sprintf("AIS_%s", id), nil
+}
+
+// parseAccessionID returns the trimmed text of the first
+// paket/ablieferung/ablieferungsnummer element in the XML file at path.
+func parseAccessionID(path string) (string, error) {
+	f, err := os.Open(path) // #nosec G304 -- trusted path
+	if err != nil {
+		return "", fmt.Errorf("open metadata file: %v", err)
+	}
+	defer f.Close()
+
+	sp, err := xmlquery.CreateStreamParser(f, "//paket/ablieferung/ablieferungsnummer")
+	if err != nil {
+		return "", fmt.Errorf("create XML parser: %v", err)
+	}
+
+	n, err := sp.Read()
+	if err == io.EOF {
+		return "", fmt.Errorf("can't find ablieferungsnummer in %q", filepath.Base(path))
+	}
+	if err != nil {
+		return "", fmt.Errorf("read XML stream: %v", err)
+	}
+
+	// Surrounding whitespace would otherwise end up in the file name.
+	id := strings.TrimSpace(n.InnerText())
+	if id == "" {
+		return "", fmt.Errorf("empty ablieferungsnummer in %q", filepath.Base(path))
+	}
+
+	return id, nil
+}
diff --git a/internal/ais/mergemd_test.go b/internal/ais/mergemd_test.go
new file mode 100644
index 00000000..b3b2ac77
--- /dev/null
+++ b/internal/ais/mergemd_test.go
@@ -0,0 +1,83 @@
+package ais_test
+
+import (
+	"path/filepath"
+	"testing"
+
+	temporalsdk_activity "go.temporal.io/sdk/activity"
+	temporalsdk_testsuite "go.temporal.io/sdk/testsuite"
+	"gotest.tools/v3/assert"
+
+	"github.com/artefactual-sdps/preprocessing-sfa/internal/ais"
+)
+
+func TestExecute(t *testing.T) {
+	t.Parallel()
+
+	aipPath := "../testdata/little-Test-AIP-Digitization/"
+
+	tests := []struct {
+		name    string
+		params  ais.MergeMDActivityParams
+		want    ais.MergeMDActivityResult
+		wantErr string
+	}{
+		{
+			name: "Returns an AIS metadata file path",
+			params: ais.MergeMDActivityParams{
+				METSRelPath:     "",
+				MetadataRelPath: "additional/UpdatedAreldaMetadata.xml",
+				WorkingDir:      aipPath,
+			},
+			want: ais.MergeMDActivityResult{
+				Path: filepath.Join(aipPath, "AIS_1000_893_3251903"),
+			},
+		},
+		{
+			name: "Errors if metadata file doesn't exist",
+			params: ais.MergeMDActivityParams{
+				METSRelPath:     "",
+				MetadataRelPath: "additional/missing.xml",
+				WorkingDir:      aipPath,
+			},
+			wantErr: "activity error (type: merge-md-files, scheduledEventID: 0, startedEventID: 0, identity: ): get AIS filename: get accession number: open metadata file: open ../testdata/little-Test-AIP-Digitization/additional/missing.xml: no such file or directory",
+		},
+		{
+			name: "Errors if metadata file is invalid",
+			params: ais.MergeMDActivityParams{
+				METSRelPath:     "",
+				MetadataRelPath: "content/content/d_0000001/00000001_PREMIS.xml",
+				WorkingDir:      aipPath,
+			},
+			wantErr: "activity error (type: merge-md-files, scheduledEventID: 0, startedEventID: 0, identity: ): get AIS filename: get accession number: can't find ablieferungsnummer in \"00000001_PREMIS.xml\"",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			ts := &temporalsdk_testsuite.WorkflowTestSuite{}
+			env := ts.NewTestActivityEnvironment()
+			env.RegisterActivityWithOptions(
+				ais.NewMergeMDActivity().Execute,
+				temporalsdk_activity.RegisterOptions{Name: ais.MergeMDActivityName},
+			)
+
+			future, err := env.ExecuteActivity(ais.MergeMDActivityName, tt.params)
+			if tt.wantErr != "" {
+				if err == nil {
+					t.Errorf("error is nil, expecting: %q", tt.wantErr)
+				} else {
+					assert.ErrorContains(t, err, tt.wantErr)
+				}
+
+				return
+			}
+			assert.NilError(t, err)
+
+			var got ais.MergeMDActivityResult
+			assert.NilError(t, future.Get(&got))
+			assert.DeepEqual(t, got, tt.want)
+		})
+	}
+}
diff --git a/internal/ais/workflow.go b/internal/ais/workflow.go
index 351a2eed..a3b5ab15 100644
--- a/internal/ais/workflow.go
+++ b/internal/ais/workflow.go
@@ -151,36 +151,27 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, params *Work
 		return "", e
 	}
 
-	if parseResult.UpdatedAreldaMetadataRelPath != "" {
-		var fetchMetadataResult FetchActivityResult
-		e = temporalsdk_workflow.ExecuteActivity(
-			withRemoteActOpts(ctx),
-			FetchActivityName,
-			&FetchActivityParams{
-				AIPUUID:      params.AIPUUID,
-				RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.UpdatedAreldaMetadataRelPath),
-				Destination:  filepath.Join(localDir, filepath.Base(parseResult.UpdatedAreldaMetadataRelPath)),
-			},
-		).Get(ctx, &fetchMetadataResult)
-		if e != nil {
-			return "", e
-		}
+	// Use the updated Arelda metadata path when set, else the metadata path.
+	metadataPath := parseResult.UpdatedAreldaMetadataRelPath
+	if metadataPath == "" {
+		metadataPath = parseResult.MetadataRelPath
 	}
 
-	if parseResult.UpdatedAreldaMetadataRelPath == "" && parseResult.MetadataRelPath != "" {
+	// Skip the fetch when neither path is set.
+	if metadataPath != "" {
 		var fetchMetadataResult FetchActivityResult
 		e = temporalsdk_workflow.ExecuteActivity(
 			withRemoteActOpts(ctx),
 			FetchActivityName,
 			&FetchActivityParams{
 				AIPUUID:      params.AIPUUID,
-				RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.MetadataRelPath),
-				Destination:  filepath.Join(localDir, filepath.Base(parseResult.MetadataRelPath)),
+				RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, metadataPath),
+				Destination:  filepath.Join(localDir, filepath.Base(metadataPath)),
 			},
 		).Get(ctx, &fetchMetadataResult)
 		if e != nil {
 			return "", e
 		}
 	}
 
 	var zipResult archivezip.Result