Skip to content

Commit

Permalink
Validate the logical metadata file, fixes #98
Browse files Browse the repository at this point in the history
Add an activity to validate the logical metadata PREMIS XML file against
a local copy of the PREMIS v3 XSD.

Steps:
- Verify that an SFA AIP includes a logical metadata file
- Create a local PREMIS v3 XSD file for validating the logical metadata
  file
- Validate the logical metadata file against the PREMIS v3 XSD
  • Loading branch information
djjuhasz committed Dec 17, 2024
1 parent a682cec commit f664952
Show file tree
Hide file tree
Showing 15 changed files with 1,702 additions and 86 deletions.
4 changes: 4 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ func (m *Main) Run(ctx context.Context) error {
xmlvalidate.New(xmlvalidate.NewXMLLintValidator()).Execute,
temporalsdk_activity.RegisterOptions{Name: xmlvalidate.Name},
)
w.RegisterActivityWithOptions(
activities.NewValidatePREMIS(xmlvalidate.NewXMLLintValidator()).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.ValidatePREMISName},
)
w.RegisterActivityWithOptions(
activities.NewTransformSIP().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.TransformSIPName},
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.9.0
github.com/tonglil/buflogr v1.1.1
go.artefactual.dev/tools v0.17.0
go.temporal.io/sdk v1.26.1
go.uber.org/mock v0.4.0
Expand Down
12 changes: 8 additions & 4 deletions internal/activities/identify_sip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ func TestIdentifySIP(t *testing.T) {
t.Parallel()

path := fs.NewDir(t, "",
fs.WithDir("content",
fs.WithDir("d_0000001",
fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""),
fs.WithDir("Digitized-AIP",
fs.WithDir("content",
fs.WithDir("d_0000001",
fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""),
),
),
fs.WithDir("additional"),
),
fs.WithDir("additional")).Path()
).Join("Digitized-AIP")

tests := []struct {
name string
Expand All @@ -39,6 +42,7 @@ func TestIdentifySIP(t *testing.T) {
Type: enums.SIPTypeDigitizedAIP,
Path: path,
ContentPath: filepath.Join(path, "content", "content"),
LogicalMDPath: filepath.Join(path, "additional", "Digitized-AIP-premis.xml"),
ManifestPath: filepath.Join(path, "additional", "UpdatedAreldaMetadata.xml"),
MetadataPath: filepath.Join(path, "content", "header", "old", "SIP", "metadata.xml"),
UpdatedAreldaMDPath: filepath.Join(path, "additional", "UpdatedAreldaMetadata.xml"),
Expand Down
2 changes: 1 addition & 1 deletion internal/activities/validate_files.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileF

ff, err := a.identifier.Identify(path)
if err != nil {
logger.Info("format identication failed", "path", path)
logger.Info("format identification failed", "path", path)
} else {
formats[path] = ff
}
Expand Down
92 changes: 92 additions & 0 deletions internal/activities/validate_premis.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package activities

import (
"context"
"fmt"
"os"
"path/filepath"

"github.com/artefactual-sdps/temporal-activities/xmlvalidate"
"go.artefactual.dev/tools/temporal"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fsutil"
"github.com/artefactual-sdps/preprocessing-sfa/internal/premis"
)

// ValidatePREMISName is the name under which the ValidatePREMIS activity is
// registered.
const ValidatePREMISName = "ValidatePREMIS"

// ValidatePREMIS is an activity that validates a PREMIS XML file against an
// XSD using the configured validator.
type ValidatePREMIS struct {
	// validator performs the XML-against-XSD validation.
	validator xmlvalidate.XSDValidator

	// xsd caches the path of the local XSD file once it has been created.
	xsd string
}

// ValidatePREMISParams is the input of the ValidatePREMIS activity.
type ValidatePREMISParams struct {
	// Path of the PREMIS XML file to be validated.
	Path string
}

// ValidatePREMISResult is the output of the ValidatePREMIS activity.
type ValidatePREMISResult struct {
	// Failures lists the validation failure messages. It is empty when the
	// file passed validation.
	Failures []string
}

// NewValidatePREMIS returns a ValidatePREMIS activity that uses v to validate
// PREMIS files.
func NewValidatePREMIS(v xmlvalidate.XSDValidator) *ValidatePREMIS {
	activity := new(ValidatePREMIS)
	activity.validator = v

	return activity
}

// Execute validates the PREMIS XML file at params.Path against the PREMIS XSD.
//
// A missing file, or a file for which the validator produces any output, is
// reported as a message in the result's Failures and is not an error. An
// error is returned only when the local XSD path can not be obtained or the
// validator itself returns an error. Returned errors wrap the underlying
// cause so it stays available to errors.Is and errors.As.
func (a *ValidatePREMIS) Execute(ctx context.Context, params *ValidatePREMISParams) (*ValidatePREMISResult, error) {
	var failures []string

	logger := temporal.GetLogger(ctx)

	if !fsutil.FileExists(params.Path) {
		failures = append(
			failures,
			fmt.Sprintf("file not found: %s", filepath.Base(params.Path)),
		)
		return &ValidatePREMISResult{Failures: failures}, nil
	}

	xsd, err := a.xsdPath()
	if err != nil {
		return nil, fmt.Errorf("get PREMIS XSD path: %w", err)
	}

	out, err := a.validator.Validate(ctx, params.Path, xsd)
	if err != nil {
		return nil, fmt.Errorf("validate PREMIS: %w", err)
	}
	if out != "" {
		// The full validator output goes to the log; the failure message only
		// names the file.
		logger.Info("PREMIS validation failed", "file", params.Path, "output", out)
		failures = append(
			failures,
			fmt.Sprintf("%s does not match expected metadata requirements", filepath.Base(params.Path)),
		)
	}

	return &ValidatePREMISResult{Failures: failures}, nil
}

// xsdPath returns the path to a local PREMIS v3 XSD file, creating the file if
// necessary.
//
// On first use the XSD is written to a new temporary file and the resulting
// path is cached on the activity; later calls return the cached path. If the
// file can not be completely written and closed, it is removed and an error is
// returned, so a partial XSD is never left behind or cached.
func (a *ValidatePREMIS) xsdPath() (string, error) {
	if a.xsd != "" {
		return a.xsd, nil
	}

	f, err := os.CreateTemp("", "premis-v3-*.xsd")
	if err != nil {
		return "", err
	}

	_, err = f.Write(premis.XSDv3)
	// Close explicitly (not via defer) so a failure to close is reported
	// instead of being dropped; a write error takes precedence.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		// Best-effort cleanup: the write/close error is the one worth
		// reporting, so a failure to remove the file is ignored.
		_ = os.Remove(f.Name())
		return "", err
	}

	a.xsd = f.Name()

	return a.xsd, nil
}
154 changes: 154 additions & 0 deletions internal/activities/validate_premis_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
package activities_test

import (
"bytes"
"context"
"errors"
"testing"

"github.com/artefactual-sdps/temporal-activities/xmlvalidate"
"github.com/tonglil/buflogr"
"go.artefactual.dev/tools/temporal"
temporalsdk_activity "go.temporal.io/sdk/activity"
temporalsdk_interceptor "go.temporal.io/sdk/interceptor"
temporalsdk_testsuite "go.temporal.io/sdk/testsuite"
temporalsdk_worker "go.temporal.io/sdk/worker"
"gotest.tools/v3/assert"
"gotest.tools/v3/fs"

"github.com/artefactual-sdps/preprocessing-sfa/internal/activities"
)

// premisXML is a small PREMIS document (PREMIS v3 namespace) describing a
// single file object. It is used as the content of the "premis.xml" fixture
// files created by the test cases in TestValidatePREMIS.
var premisXML = `<?xml version="1.0" encoding="UTF-8"?>
<premis:premis xmlns:premis="http://www.loc.gov/premis/v3" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/premis/v3 https://www.loc.gov/standards/premis/premis.xsd" version="3.0">
<premis:object xsi:type="premis:file">
<premis:objectIdentifier>
<premis:objectIdentifierType>uuid</premis:objectIdentifierType>
<premis:objectIdentifierValue>c74a85b7-919b-409e-8209-9c7ebe0e7945</premis:objectIdentifierValue>
</premis:objectIdentifier>
<premis:objectCharacteristics>
<premis:format>
<premis:formatDesignation>
<premis:formatName/>
</premis:formatDesignation>
</premis:format>
</premis:objectCharacteristics>
<premis:originalName>data/objects/test_transfer/content/cat.jpg</premis:originalName>
</premis:object>
</premis:premis>
`

// fakeValidator is a test double for an XSD validator. Validate ignores its
// arguments and returns the canned Msg and Err values.
type fakeValidator struct {
	Msg string // Output returned by Validate.
	Err error  // Error returned by Validate.
}

// newFakeValidator returns a fakeValidator with an empty message and a nil
// error.
func newFakeValidator() *fakeValidator {
	return new(fakeValidator)
}

// WithMsg sets the output returned by Validate and returns the receiver so
// calls can be chained.
func (f *fakeValidator) WithMsg(msg string) *fakeValidator {
	f.Msg = msg

	return f
}

// WithErr sets the error returned by Validate and returns the receiver so
// calls can be chained.
func (f *fakeValidator) WithErr(err error) *fakeValidator {
	f.Err = err

	return f
}

// Validate returns the configured message and error without inspecting the
// context or either path.
func (f *fakeValidator) Validate(_ context.Context, _, _ string) (string, error) {
	msg, err := f.Msg, f.Err

	return msg, err
}

// TestValidatePREMIS runs the ValidatePREMIS activity in a Temporal test
// activity environment and checks, for each case, either the returned
// failures or the returned error.
func TestValidatePREMIS(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name      string
		validator xmlvalidate.XSDValidator
		params    activities.ValidatePREMISParams
		want      activities.ValidatePREMISResult
		wantErr   string
	}{
		{
			// The only case that runs a real validator rather than the fake.
			name:      "Validates a PREMIS file",
			validator: xmlvalidate.NewXMLLintValidator(),
			params: activities.ValidatePREMISParams{
				Path: fs.NewDir(t, "enduro-test",
					fs.WithFile("premis.xml", premisXML),
				).Join("premis.xml"),
			},
		},
		{
			name:      "Returns a validation failure",
			validator: newFakeValidator().WithMsg("premis.xml:12: parser error"),
			params: activities.ValidatePREMISParams{
				Path: fs.NewDir(t, "enduro-test",
					fs.WithFile("premis.xml", premisXML),
				).Join("premis.xml"),
			},
			want: activities.ValidatePREMISResult{
				Failures: []string{"premis.xml does not match expected metadata requirements"},
			},
		},
		{
			// The directory is created without a premis.xml file, so the
			// activity reports the missing file before calling the validator.
			name:      "Returns a file not found failure",
			validator: newFakeValidator().WithErr(errors.New("file not found")),
			params: activities.ValidatePREMISParams{
				Path: fs.NewDir(t, "enduro-test").Join("premis.xml"),
			},
			want: activities.ValidatePREMISResult{
				Failures: []string{"file not found: premis.xml"},
			},
		},
		{
			name:      "Returns a system error",
			validator: newFakeValidator().WithErr(errors.New("permission denied: premis.xml")),
			params: activities.ValidatePREMISParams{
				Path: fs.NewDir(t, "enduro-test",
					fs.WithFile("premis.xml", premisXML),
				).Join("premis.xml"),
			},
			wantErr: "permission denied: premis.xml",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			var logbuf bytes.Buffer
			logger := buflogr.NewWithBuffer(&logbuf)

			ts := &temporalsdk_testsuite.WorkflowTestSuite{}
			env := ts.NewTestActivityEnvironment()
			env.SetWorkerOptions(temporalsdk_worker.Options{
				Interceptors: []temporalsdk_interceptor.WorkerInterceptor{
					temporal.NewLoggerInterceptor(logger),
				},
			})
			env.RegisterActivityWithOptions(
				activities.NewValidatePREMIS(tt.validator).Execute,
				temporalsdk_activity.RegisterOptions{Name: activities.ValidatePREMISName},
			)

			enc, err := env.ExecuteActivity(activities.ValidatePREMISName, tt.params)
			t.Log(logbuf.String()) // Echo log for debugging.

			if tt.wantErr != "" {
				// ErrorContains also fails the test when err is nil.
				assert.ErrorContains(t, err, tt.wantErr)
				return
			}
			assert.NilError(t, err)

			var result activities.ValidatePREMISResult
			// Fail on a decode error rather than comparing a zero-value
			// result against the expectation.
			assert.NilError(t, enc.Get(&result))
			assert.DeepEqual(t, result, tt.want)
		})
	}
}
31 changes: 24 additions & 7 deletions internal/activities/validate_structure.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,14 @@ func (a *ValidateStructure) Execute(
) (*ValidateStructureResult, error) {
var failures []string

// Check existence of content and XSD folders.
// Check existence of the content directory.
hasContentDir := true
if !fsutil.FileExists(params.SIP.ContentPath) {
failures = append(failures, "Content folder is missing")
hasContentDir = false
}

// Check existence of the XSD directory.
if !fsutil.FileExists(params.SIP.XSDPath) {
failures = append(failures, "XSD folder is missing")
}
Expand All @@ -51,15 +53,20 @@ func (a *ValidateStructure) Execute(
))
}

// Check existence of UpdatedAreldaMetadata file (digitized AIP only).
if params.SIP.Type == enums.SIPTypeDigitizedAIP && !fsutil.FileExists(params.SIP.UpdatedAreldaMDPath) {
// Check existence of UpdatedAreldaMetadata file (AIPs only).
if params.SIP.IsAIP() && !fsutil.FileExists(params.SIP.UpdatedAreldaMDPath) {
failures = append(failures, fmt.Sprintf(
"%s is missing", filepath.Base(params.SIP.UpdatedAreldaMDPath),
))
}

sipBase := params.SIP.Path
// Check existence of logical metadata file (AIPs only).
if params.SIP.IsAIP() && !fsutil.FileExists(params.SIP.LogicalMDPath) {
failures = append(failures, fmt.Sprintf("%s is missing", filepath.Base(params.SIP.LogicalMDPath)))
}

// Check for unexpected top-level directories.
sipBase := params.SIP.Path
extras, err := extraNodes(sipBase, params.SIP.Path, params.SIP.TopLevelPaths, true)
if err != nil {
return nil, fmt.Errorf("ValidateStructure: check for unexpected dirs: %v", err)
Expand All @@ -75,14 +82,24 @@ func (a *ValidateStructure) Execute(
failures = append(failures, extras...)
}

// Check that digitized SIPs only have one dossier in the content dir.
if params.SIP.Type == enums.SIPTypeDigitizedSIP {
// Check that digitized packages only have one dossier in the content dir.
if params.SIP.Type == enums.SIPTypeDigitizedSIP || params.SIP.Type == enums.SIPTypeDigitizedAIP && hasContentDir {
entries, err := os.ReadDir(params.SIP.ContentPath)
if err != nil {
return nil, fmt.Errorf("ValidateStructure: check for unexpected dossiers: %v", err)
}

if len(entries) > 1 {
dirs := 0
for _, e := range entries {
if e.IsDir() {
dirs += 1
}
if dirs > 1 {
break
}
}

if dirs > 1 {
failures = append(failures, "More than one dossier in the content directory")
}
}
Expand Down
Loading

0 comments on commit f664952

Please sign in to comment.