From 458d34b07f53ebd9e4f58633602411abdc4783d8 Mon Sep 17 00:00:00 2001 From: David Juhasz Date: Wed, 20 Nov 2024 17:00:29 -0800 Subject: [PATCH] Add veraPDF validation of PDF/A files - Add verapdf and the JRE to the worker Docker image - Add a validate_files activity to identify SIP file formats then validate the file formats for which we have a validator - Copy siegfried_embed and the format Identifier interface from https://github.com/artefactual-sdps/temporal-activities - Add the fvalidate package and Validator interface - Add a veraPDF implementation of the Validator interface - Run veraPDF in "batch" mode to minimize startup overheads - Add file validation configuration to config file - Add processing events for file validation success and failure - Add veraPDF binary path to kube dev overlay --- Dockerfile | 13 + Makefile | 9 + cmd/worker/workercmd/cmd.go | 9 + go.mod | 3 +- go.sum | 2 + .../overlays/dev/preprocessing-secret.yaml | 3 + hack/make/dep_mockgen.mk | 16 + internal/activities/validate_files.go | 129 ++++++ internal/activities/validate_files_test.go | 222 ++++++++++ internal/config/config.go | 12 +- internal/fformat/fake/mock_identifier.go | 117 ++++++ internal/fformat/fake/mock_validator.go | 154 +++++++ internal/fformat/fformat.go | 21 + internal/fformat/siegfried_embed.go | 75 ++++ internal/fformat/siegfried_embed_test.go | 48 +++ internal/fvalidate/config.go | 9 + internal/fvalidate/fake/mock_validator.go | 154 +++++++ internal/fvalidate/fvalidate.go | 13 + internal/fvalidate/verapdf_validator.go | 84 ++++ internal/fvalidate/verapdf_validator_test.go | 96 +++++ internal/workflow/preprocessing.go | 33 +- internal/workflow/preprocessing_test.go | 44 +- verapdf_report.json | 385 ++++++++++++++++++ 23 files changed, 1640 insertions(+), 11 deletions(-) create mode 100644 hack/make/dep_mockgen.mk create mode 100644 internal/activities/validate_files.go create mode 100644 internal/activities/validate_files_test.go create mode 100644 
internal/fformat/fake/mock_identifier.go create mode 100644 internal/fformat/fake/mock_validator.go create mode 100644 internal/fformat/fformat.go create mode 100644 internal/fformat/siegfried_embed.go create mode 100644 internal/fformat/siegfried_embed_test.go create mode 100644 internal/fvalidate/config.go create mode 100644 internal/fvalidate/fake/mock_validator.go create mode 100644 internal/fvalidate/fvalidate.go create mode 100644 internal/fvalidate/verapdf_validator.go create mode 100644 internal/fvalidate/verapdf_validator_test.go create mode 100644 verapdf_report.json diff --git a/Dockerfile b/Dockerfile index f3ed11fe..a52d8385 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,16 +22,29 @@ RUN --mount=type=cache,target=/go/pkg/mod \ -o /out/preprocessing-worker \ ./cmd/worker +# Build worker image FROM alpine:3.20 AS preprocessing-worker RUN apk add --update --no-cache libxml2-utils +# Copy the JRE (Eclipse Temurin v11) from the verapdf/cli image +ENV JAVA_HOME=/opt/java/openjdk +ENV PATH="${JAVA_HOME}/bin:${PATH}" +COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link $JAVA_HOME $JAVA_HOME + ARG USER_ID=1000 ARG GROUP_ID=1000 RUN addgroup -g ${GROUP_ID} -S preprocessing RUN adduser -u ${USER_ID} -S -D preprocessing preprocessing +# Make preprocessing the owner of the verapdf log dir +RUN mkdir --parents /var/opt/verapdf/logs && chown -R preprocessing:preprocessing /var/opt/verapdf + USER preprocessing + COPY --from=build-preprocessing-worker --link /out/preprocessing-worker /home/preprocessing/bin/preprocessing-worker RUN mkdir /home/preprocessing/shared +# Copy the veraPDF application (v1.26.2) from the verapdf/cli image +COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link /opt/verapdf/ /opt/verapdf/ + CMD ["/home/preprocessing/bin/preprocessing-worker"] diff --git a/Makefile b/Makefile index 806943ea..d32f8569 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,7 @@ include hack/make/dep_golines.mk include hack/make/dep_gomajor.mk include hack/make/dep_gosec.mk 
include hack/make/dep_gotestsum.mk +include hack/make/dep_mockgen.mk include hack/make/dep_shfmt.mk include hack/make/dep_tparse.mk include hack/make/enums.mk @@ -30,6 +31,7 @@ TOOLS = $(GOLANGCI_LINT) \ $(GOMAJOR) \ $(GOSEC) \ $(GOTESTSUM) \ + $(MOCKGEN) \ $(SHFMT) \ $(TPARSE) @@ -40,7 +42,9 @@ endef IGNORED_PACKAGES := \ github.com/artefactual-sdps/preprocessing-sfa/hack/% \ + github.com/artefactual-sdps/preprocessing-sfa/internal/%/fake \ github.com/artefactual-sdps/preprocessing-sfa/internal/enums + PACKAGES := $(shell go list ./...) TEST_PACKAGES := $(filter-out $(IGNORED_PACKAGES),$(PACKAGES)) TEST_IGNORED_PACKAGES := $(filter $(IGNORED_PACKAGES),$(PACKAGES)) @@ -55,6 +59,11 @@ deps: # @HELP List available module dependency updates. deps: $(GOMAJOR) gomajor list +gen-mock: # @HELP Generate mocks. +gen-mock: $(MOCKGEN) + mockgen -typed -destination=./internal/fformat/fake/mock_identifier.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Identifier + mockgen -typed -destination=./internal/fvalidate/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate Validator + golines: # @HELP Run the golines formatter to fix long lines. 
golines: GOLINES_OUT_MODE ?= write-output golines: $(GOLINES) diff --git a/cmd/worker/workercmd/cmd.go b/cmd/worker/workercmd/cmd.go index f7f02273..908d3a7b 100644 --- a/cmd/worker/workercmd/cmd.go +++ b/cmd/worker/workercmd/cmd.go @@ -17,6 +17,8 @@ import ( "github.com/artefactual-sdps/preprocessing-sfa/internal/activities" "github.com/artefactual-sdps/preprocessing-sfa/internal/config" + "github.com/artefactual-sdps/preprocessing-sfa/internal/fformat" + "github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate" "github.com/artefactual-sdps/preprocessing-sfa/internal/workflow" ) @@ -78,6 +80,13 @@ func (m *Main) Run(ctx context.Context) error { ffvalidate.New(m.cfg.FileFormat).Execute, temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name}, ) + w.RegisterActivityWithOptions( + activities.NewValidateFiles( + fformat.NewSiegfriedEmbed(), + fvalidate.NewVeraPDFValidator(m.cfg.FileValidate.VeraPDF.Path, m.logger), + ).Execute, + temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName}, + ) w.RegisterActivityWithOptions( activities.NewAddPREMISObjects(rand.Reader).Execute, temporalsdk_activity.RegisterOptions{Name: activities.AddPREMISObjectsName}, diff --git a/go.mod b/go.mod index c2f2833c..974b4cff 100644 --- a/go.mod +++ b/go.mod @@ -11,11 +11,13 @@ require ( github.com/google/uuid v1.6.0 github.com/hashicorp/go-cleanhttp v0.5.2 github.com/oklog/run v1.1.0 + github.com/richardlehane/siegfried v1.11.1 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.9.0 go.artefactual.dev/tools v0.14.0 go.temporal.io/sdk v1.26.1 + go.uber.org/mock v0.4.0 gocloud.dev v0.39.0 gotest.tools/v3 v3.5.1 ) @@ -67,7 +69,6 @@ require ( github.com/richardlehane/match v1.0.5 // indirect github.com/richardlehane/mscfb v1.0.4 // indirect github.com/richardlehane/msoleps v1.0.3 // indirect - github.com/richardlehane/siegfried v1.11.1 // indirect github.com/richardlehane/xmldetect v1.0.2 // indirect github.com/robfig/cron 
v1.2.0 // indirect github.com/ross-spencer/spargo v0.4.1 // indirect diff --git a/go.sum b/go.sum index 95e3e119..9dc543fd 100644 --- a/go.sum +++ b/go.sum @@ -266,6 +266,8 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= +go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= diff --git a/hack/kube/overlays/dev/preprocessing-secret.yaml b/hack/kube/overlays/dev/preprocessing-secret.yaml index 3a74561a..59d5c94f 100644 --- a/hack/kube/overlays/dev/preprocessing-secret.yaml +++ b/hack/kube/overlays/dev/preprocessing-secret.yaml @@ -50,6 +50,9 @@ stringData: [fileformat] allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv" + [filevalidate.verapdf] + path = "/opt/verapdf/verapdf" + allowed_file_formats.csv: | Format name,PRONOM PUID text,x-fmt/16 diff --git a/hack/make/dep_mockgen.mk b/hack/make/dep_mockgen.mk new file mode 100644 index 00000000..c9a9962a --- /dev/null +++ b/hack/make/dep_mockgen.mk @@ -0,0 +1,16 @@ +$(call _assert_var,MAKEDIR) +$(call _conditional_include,$(MAKEDIR)/base.mk) +$(call _assert_var,CACHE_VERSIONS) +$(call _assert_var,CACHE_BIN) + +MOCKGEN_VERSION ?= 0.4.0 + +MOCKGEN := $(CACHE_VERSIONS)/mockgen/$(MOCKGEN_VERSION) +$(MOCKGEN): + rm -f $(CACHE_BIN)/mockgen + mkdir -p $(CACHE_BIN) + env GOBIN=$(CACHE_BIN) go install go.uber.org/mock/mockgen@v$(MOCKGEN_VERSION) + chmod +x $(CACHE_BIN)/mockgen + rm -rf $(dir $(MOCKGEN)) + 
mkdir -p $(dir $(MOCKGEN))
+	touch $(MOCKGEN)
diff --git a/internal/activities/validate_files.go b/internal/activities/validate_files.go
new file mode 100644
index 00000000..593c8ca0
--- /dev/null
+++ b/internal/activities/validate_files.go
@@ -0,0 +1,152 @@
+package activities
+
+import (
+	"context"
+	"fmt"
+	"io/fs"
+	"path/filepath"
+	"slices"
+
+	"go.artefactual.dev/tools/temporal"
+
+	"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
+	"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
+	"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
+)
+
+// ValidateFilesName is the name the ValidateFiles activity is registered under.
+const ValidateFilesName = "validate-files"
+
+type (
+	// ValidateFiles is an activity that identifies the format of the files in
+	// a SIP and runs the configured validators on the SIP content.
+	ValidateFiles struct {
+		identifier fformat.Identifier
+		validators []fvalidate.Validator
+	}
+
+	// ValidateFilesParams are the inputs of the ValidateFiles activity.
+	ValidateFilesParams struct {
+		SIP sip.SIP
+	}
+
+	// ValidateFilesResult holds the failure messages reported by the
+	// validators. Failures is empty when no validator reported a failure.
+	ValidateFilesResult struct {
+		Failures []string
+	}
+)
+
+// fileFormats maps a file path to the format identified for that file.
+type fileFormats map[string]*fformat.FileFormat
+
+// NewValidateFiles returns a ValidateFiles activity that identifies file
+// formats with idr and validates the SIP content with each of vdrs.
+func NewValidateFiles(idr fformat.Identifier, vdrs ...fvalidate.Validator) *ValidateFiles {
+	return &ValidateFiles{
+		identifier: idr,
+		validators: vdrs,
+	}
+}
+
+// Execute validates SIP files against a file format specification. The only
+// format validator currently implemented is veraPDF, for PDF/A files.
+//
+// Files whose format can't be identified are logged and skipped. Validator
+// failure messages are returned in the result. An error is returned when
+// walking the SIP content fails, the context is done, or a validator errors.
+func (a *ValidateFiles) Execute(ctx context.Context, params *ValidateFilesParams) (*ValidateFilesResult, error) {
+	formats, err := a.identifyFormats(ctx, params.SIP)
+	if err != nil {
+		return nil, fmt.Errorf("identifyFormats: %w", err)
+	}
+
+	failures, err := a.validateFiles(params.SIP, formats)
+	if err != nil {
+		return nil, fmt.Errorf("validateFiles: %w", err)
+	}
+
+	return &ValidateFilesResult{Failures: failures}, nil
+}
+
+// identifyFormats walks the SIP content directory and returns the identified
+// format of every file, keyed by path. Files that can't be identified are
+// logged and left out of the result.
+func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileFormats, error) {
+	logger := temporal.GetLogger(ctx)
+	formats := make(fileFormats)
+	err := filepath.WalkDir(sip.ContentPath, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Stop walking once the context is done and return the real cause
+		// (cancellation or deadline) so callers can match it with errors.Is.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return ctxErr
+		}
+
+		if d.IsDir() {
+			return nil
+		}
+
+		ff, err := a.identifier.Identify(path)
+		if err != nil {
+			// Identification is best-effort: log the cause and keep going.
+			logger.Info("format identification failed", "path", path, "err", err)
+			return nil
+		}
+		formats[path] = ff
+
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return formats, nil
+}
+
+// validateFiles runs every validator that supports at least one of the
+// identified formats and collects the non-empty failure messages.
+func (a *ValidateFiles) validateFiles(
+	sip sip.SIP,
+	files fileFormats,
+) ([]string, error) {
+	var failures []string
+	for _, v := range a.validators {
+		out, err := validate(v, sip.ContentPath, files)
+		if err != nil {
+			return nil, err
+		}
+		if out != "" {
+			failures = append(failures, out)
+		}
+	}
+
+	return failures, nil
+}
+
+// validate runs v on path when at least one of files has a format that v
+// supports; otherwise it does nothing and returns an empty string.
+func validate(v fvalidate.Validator, path string, files fileFormats) (string, error) {
+	allowedIDs := v.FormatIDs()
+
+	canValidate := false
+	for _, f := range files {
+		if f != nil && slices.Contains(allowedIDs, f.ID) {
+			canValidate = true
+			break
+		}
+	}
+	if !canValidate {
+		return "", nil
+	}
+
+	out, err := v.Validate(path)
+	if err != nil {
+		return "", err
+	}
+
+	return out, nil
+}
diff --git a/internal/activities/validate_files_test.go b/internal/activities/validate_files_test.go
new file mode 100644
index 00000000..f1768392
--- /dev/null
+++ 
b/internal/activities/validate_files_test.go @@ -0,0 +1,222 @@ +package activities_test + +import ( + "errors" + "fmt" + "path/filepath" + "testing" + + temporalsdk_activity "go.temporal.io/sdk/activity" + temporalsdk_testsuite "go.temporal.io/sdk/testsuite" + "go.uber.org/mock/gomock" + "gotest.tools/v3/assert" + "gotest.tools/v3/fs" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/activities" + "github.com/artefactual-sdps/preprocessing-sfa/internal/fformat" + fake_fformat "github.com/artefactual-sdps/preprocessing-sfa/internal/fformat/fake" + fake_fvalidate "github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate/fake" + "github.com/artefactual-sdps/preprocessing-sfa/internal/sip" +) + +func TestValidateFiles(t *testing.T) { + t.Parallel() + + digitizedAIP, err := sip.New(fs.NewDir(t, "", + fs.WithDir("additional", + fs.WithFile("UpdatedAreldaMetadata.xml", ""), + ), + fs.WithDir("content", + fs.WithDir("content", + fs.WithDir("d_0000001", + fs.WithFile("test.pdf", ""), + fs.WithFile("Prozess_Digitalisierung_PREMIS.xml", ""), + ), + ), + fs.WithDir("header", + fs.WithDir("old", + fs.WithDir("SIP", + fs.WithFile("metadata.xml", ""), + ), + ), + fs.WithDir("xsd", + fs.WithFile("arelda.xsd", ""), + ), + ), + ), + ).Path()) + assert.NilError(t, err) + + tests := []struct { + name string + params activities.ValidateFilesParams + expectId func(*fake_fformat.MockIdentifierMockRecorder) + expectVld func(*fake_fvalidate.MockValidatorMockRecorder) + want activities.ValidateFilesResult + wantErr string + }{ + { + name: "Validates a PDF/A file", + params: activities.ValidateFilesParams{SIP: digitizedAIP}, + expectId: func(m *fake_fformat.MockIdentifierMockRecorder) { + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "test.pdf"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/354", + }, + nil, + ) + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "Prozess_Digitalisierung_PREMIS.xml"), + ).Return( 
+ &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/101", + }, + nil, + ) + }, + expectVld: func(m *fake_fvalidate.MockValidatorMockRecorder) { + m.FormatIDs().Return([]string{"fmt/354"}) + m.Validate(digitizedAIP.ContentPath).Return("", nil) + }, + }, + { + name: "Reports PDF validation errors", + params: activities.ValidateFilesParams{SIP: digitizedAIP}, + expectId: func(m *fake_fformat.MockIdentifierMockRecorder) { + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "test.pdf"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/354", + }, + nil, + ) + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "Prozess_Digitalisierung_PREMIS.xml"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/101", + }, + nil, + ) + }, + expectVld: func(m *fake_fvalidate.MockValidatorMockRecorder) { + m.FormatIDs().Return([]string{"fmt/354"}) + m.Validate(digitizedAIP.ContentPath).Return( + "One or more PDF/A files are invalid", + nil, + ) + }, + want: activities.ValidateFilesResult{ + Failures: []string{"One or more PDF/A files are invalid"}, + }, + }, + { + name: "Errors on application error", + params: activities.ValidateFilesParams{SIP: digitizedAIP}, + expectId: func(m *fake_fformat.MockIdentifierMockRecorder) { + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "test.pdf"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/354", + }, + nil, + ) + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "Prozess_Digitalisierung_PREMIS.xml"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/101", + }, + nil, + ) + }, + expectVld: func(m *fake_fvalidate.MockValidatorMockRecorder) { + m.FormatIDs().Return([]string{"fmt/354"}) + m.Validate(digitizedAIP.ContentPath).Return( + "", + errors.New("can't open /fake/path: permission denied"), + ) + }, + wantErr: "can't open /fake/path: permission denied", + }, + { + name: "Skip validation when 
format identification fails", + params: activities.ValidateFilesParams{SIP: digitizedAIP}, + expectId: func(m *fake_fformat.MockIdentifierMockRecorder) { + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "test.pdf"), + ).Return( + nil, + fmt.Errorf( + "multiple file formats matched: %s", + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "test.pdf"), + ), + ) + m.Identify( + filepath.Join(digitizedAIP.ContentPath, "d_0000001", "Prozess_Digitalisierung_PREMIS.xml"), + ).Return( + &fformat.FileFormat{ + Namespace: "PRONOM", + ID: "fmt/101", + }, + nil, + ) + }, + expectVld: func(m *fake_fvalidate.MockValidatorMockRecorder) { + m.FormatIDs().Return([]string{"fmt/354"}) + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ts := &temporalsdk_testsuite.WorkflowTestSuite{} + env := ts.NewTestActivityEnvironment() + + ctrl := gomock.NewController(t) + + mockIdr := fake_fformat.NewMockIdentifier(ctrl) + if tt.expectId != nil { + tt.expectId(mockIdr.EXPECT()) + } + + mockVdr := fake_fvalidate.NewMockValidator(ctrl) + if tt.expectVld != nil { + tt.expectVld(mockVdr.EXPECT()) + } + + env.RegisterActivityWithOptions( + activities.NewValidateFiles(mockIdr, mockVdr).Execute, + temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName}, + ) + + enc, err := env.ExecuteActivity(activities.ValidateFilesName, tt.params) + if tt.wantErr != "" { + if err == nil { + t.Errorf("error is nil, expecting: %q", tt.wantErr) + } else { + assert.ErrorContains(t, err, tt.wantErr) + } + + return + } + assert.NilError(t, err) + + var result activities.ValidateFilesResult + _ = enc.Get(&result) + + assert.DeepEqual(t, result, tt.want) + }) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 77db3f5f..93520dfe 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ import ( "github.com/spf13/viper" "github.com/artefactual-sdps/preprocessing-sfa/internal/ais" 
+ "github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate" ) type ConfigurationValidator interface { @@ -34,11 +35,12 @@ type Configuration struct { // Enduro and preservation processing. SharedPath string - Temporal Temporal - Worker WorkerConfig - Bagit bagcreate.Config - AIS ais.Config - FileFormat ffvalidate.Config + Temporal Temporal + Worker WorkerConfig + Bagit bagcreate.Config + AIS ais.Config + FileFormat ffvalidate.Config + FileValidate fvalidate.Config } type Temporal struct { diff --git a/internal/fformat/fake/mock_identifier.go b/internal/fformat/fake/mock_identifier.go new file mode 100644 index 00000000..c0e1bb4b --- /dev/null +++ b/internal/fformat/fake/mock_identifier.go @@ -0,0 +1,117 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/artefactual-sdps/preprocessing-sfa/internal/fformat (interfaces: Identifier) +// +// Generated by this command: +// +// mockgen -typed -destination=./internal/fformat/fake/mock_identifier.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Identifier +// + +// Package fake is a generated GoMock package. +package fake + +import ( + reflect "reflect" + + fformat "github.com/artefactual-sdps/preprocessing-sfa/internal/fformat" + gomock "go.uber.org/mock/gomock" +) + +// MockIdentifier is a mock of Identifier interface. +type MockIdentifier struct { + ctrl *gomock.Controller + recorder *MockIdentifierMockRecorder +} + +// MockIdentifierMockRecorder is the mock recorder for MockIdentifier. +type MockIdentifierMockRecorder struct { + mock *MockIdentifier +} + +// NewMockIdentifier creates a new mock instance. +func NewMockIdentifier(ctrl *gomock.Controller) *MockIdentifier { + mock := &MockIdentifier{ctrl: ctrl} + mock.recorder = &MockIdentifierMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. 
+func (m *MockIdentifier) EXPECT() *MockIdentifierMockRecorder { + return m.recorder +} + +// Identify mocks base method. +func (m *MockIdentifier) Identify(arg0 string) (*fformat.FileFormat, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Identify", arg0) + ret0, _ := ret[0].(*fformat.FileFormat) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Identify indicates an expected call of Identify. +func (mr *MockIdentifierMockRecorder) Identify(arg0 any) *MockIdentifierIdentifyCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Identify", reflect.TypeOf((*MockIdentifier)(nil).Identify), arg0) + return &MockIdentifierIdentifyCall{Call: call} +} + +// MockIdentifierIdentifyCall wrap *gomock.Call +type MockIdentifierIdentifyCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockIdentifierIdentifyCall) Return(arg0 *fformat.FileFormat, arg1 error) *MockIdentifierIdentifyCall { + c.Call = c.Call.Return(arg0, arg1) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockIdentifierIdentifyCall) Do(f func(string) (*fformat.FileFormat, error)) *MockIdentifierIdentifyCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockIdentifierIdentifyCall) DoAndReturn(f func(string) (*fformat.FileFormat, error)) *MockIdentifierIdentifyCall { + c.Call = c.Call.DoAndReturn(f) + return c +} + +// Version mocks base method. +func (m *MockIdentifier) Version() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Version") + ret0, _ := ret[0].(string) + return ret0 +} + +// Version indicates an expected call of Version. 
+func (mr *MockIdentifierMockRecorder) Version() *MockIdentifierVersionCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Version", reflect.TypeOf((*MockIdentifier)(nil).Version)) + return &MockIdentifierVersionCall{Call: call} +} + +// MockIdentifierVersionCall wrap *gomock.Call +type MockIdentifierVersionCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockIdentifierVersionCall) Return(arg0 string) *MockIdentifierVersionCall { + c.Call = c.Call.Return(arg0) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockIdentifierVersionCall) Do(f func() string) *MockIdentifierVersionCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockIdentifierVersionCall) DoAndReturn(f func() string) *MockIdentifierVersionCall { + c.Call = c.Call.DoAndReturn(f) + return c +} diff --git a/internal/fformat/fake/mock_validator.go b/internal/fformat/fake/mock_validator.go new file mode 100644 index 00000000..d92eee9b --- /dev/null +++ b/internal/fformat/fake/mock_validator.go @@ -0,0 +1,154 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/artefactual-sdps/preprocessing-sfa/internal/fformat (interfaces: Validator) +// +// Generated by this command: +// +// mockgen -typed -destination=./internal/fformat/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Validator +// + +// Package fake is a generated GoMock package. +package fake + +import ( + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockValidator is a mock of Validator interface. +type MockValidator struct { + ctrl *gomock.Controller + recorder *MockValidatorMockRecorder +} + +// MockValidatorMockRecorder is the mock recorder for MockValidator. +type MockValidatorMockRecorder struct { + mock *MockValidator +} + +// NewMockValidator creates a new mock instance. 
+func NewMockValidator(ctrl *gomock.Controller) *MockValidator { + mock := &MockValidator{ctrl: ctrl} + mock.recorder = &MockValidatorMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockValidator) EXPECT() *MockValidatorMockRecorder { + return m.recorder +} + +// FormatIDs mocks base method. +func (m *MockValidator) FormatIDs() []string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FormatIDs") + ret0, _ := ret[0].([]string) + return ret0 +} + +// FormatIDs indicates an expected call of FormatIDs. +func (mr *MockValidatorMockRecorder) FormatIDs() *MockValidatorFormatIDsCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FormatIDs", reflect.TypeOf((*MockValidator)(nil).FormatIDs)) + return &MockValidatorFormatIDsCall{Call: call} +} + +// MockValidatorFormatIDsCall wrap *gomock.Call +type MockValidatorFormatIDsCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorFormatIDsCall) Return(arg0 []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.Return(arg0) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorFormatIDsCall) Do(f func() []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorFormatIDsCall) DoAndReturn(f func() []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.DoAndReturn(f) + return c +} + +// Name mocks base method. +func (m *MockValidator) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. 
+func (mr *MockValidatorMockRecorder) Name() *MockValidatorNameCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockValidator)(nil).Name)) + return &MockValidatorNameCall{Call: call} +} + +// MockValidatorNameCall wrap *gomock.Call +type MockValidatorNameCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorNameCall) Return(arg0 string) *MockValidatorNameCall { + c.Call = c.Call.Return(arg0) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorNameCall) Do(f func() string) *MockValidatorNameCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorNameCall) DoAndReturn(f func() string) *MockValidatorNameCall { + c.Call = c.Call.DoAndReturn(f) + return c +} + +// Validate mocks base method. +func (m *MockValidator) Validate(arg0 string) ([]byte, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Validate", arg0) + ret0, _ := ret[0].([]byte) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Validate indicates an expected call of Validate. 
+func (mr *MockValidatorMockRecorder) Validate(arg0 any) *MockValidatorValidateCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Validate", reflect.TypeOf((*MockValidator)(nil).Validate), arg0) + return &MockValidatorValidateCall{Call: call} +} + +// MockValidatorValidateCall wrap *gomock.Call +type MockValidatorValidateCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorValidateCall) Return(arg0 []byte, arg1 error) *MockValidatorValidateCall { + c.Call = c.Call.Return(arg0, arg1) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorValidateCall) Do(f func(string) ([]byte, error)) *MockValidatorValidateCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorValidateCall) DoAndReturn(f func(string) ([]byte, error)) *MockValidatorValidateCall { + c.Call = c.Call.DoAndReturn(f) + return c +} diff --git a/internal/fformat/fformat.go b/internal/fformat/fformat.go new file mode 100644 index 00000000..29cfcf71 --- /dev/null +++ b/internal/fformat/fformat.go @@ -0,0 +1,21 @@ +package fformat + +// Identifier provides a interface for identifying a file's format. +type Identifier interface { + // Identify returns a file format identification for the file at path. + Identify(path string) (*FileFormat, error) + + // Version returns the file format identification software version. + Version() string +} + +// A FileFormat represents a file format. +type FileFormat struct { + Namespace string // Namespace of the format identifier (e.g. "PRONOM"). + ID string // ID is the unique format identifier (e.g. "fmt/40"). + CommonName string // Common name of the format (e.g. "Microsoft Word Document"). + Version string // Version of the format (e.g. "97-2003"). + MIMEType string // MIMEType of the format (e.g. "application/msword"). + Basis string // Basis for identification of the format (e.g. "magic"). 
+	Warning    string // Warning message (if any) from the format identifier.
+}
diff --git a/internal/fformat/siegfried_embed.go b/internal/fformat/siegfried_embed.go
new file mode 100644
index 00000000..9ac2095f
--- /dev/null
+++ b/internal/fformat/siegfried_embed.go
@@ -0,0 +1,87 @@
+package fformat
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/richardlehane/siegfried"
+	"github.com/richardlehane/siegfried/pkg/config"
+	"github.com/richardlehane/siegfried/pkg/static"
+)
+
+// siegfriedEmbed is an implementation of Identifier based on the Siegfried
+// library dist. It should be the fastest implementation because it loads
+// just once.
+type siegfriedEmbed struct {
+	embed     *siegfried.Siegfried
+	version   string
+	signature string
+}
+
+var _ Identifier = (*siegfriedEmbed)(nil)
+
+// NewSiegfriedEmbed returns an Identifier backed by the embedded Siegfried
+// library.
+func NewSiegfriedEmbed() *siegfriedEmbed {
+	v := config.Version()
+	return &siegfriedEmbed{
+		embed:     static.New(),
+		version:   fmt.Sprintf("%d.%d.%d", v[0], v[1], v[2]),
+		signature: config.SignatureBase(),
+	}
+}
+
+// Identify runs the Siegfried PRONOM file identifier on the file at path and
+// returns a FileFormat pointer or an error. An error is returned when the
+// file can't be opened, Siegfried fails to identify it, or the result isn't
+// exactly one match.
+func (sf *siegfriedEmbed) Identify(path string) (*FileFormat, error) {
+	f, err := os.Open(filepath.Clean(path))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	ids, err := sf.embed.Identify(f, f.Name(), "")
+	if err != nil {
+		return nil, err
+	}
+
+	// Guard both ends: indexing ids[0] below would panic on an empty result.
+	switch {
+	case len(ids) == 0:
+		return nil, fmt.Errorf("no file format matched: %q", path)
+	case len(ids) > 1:
+		return nil, fmt.Errorf("multiple file formats matched: %q", path)
+	}
+
+	// Loop through Siegfried identifier result key-value pairs
+	var res FileFormat
+	for _, kv := range sf.embed.Label(ids[0]) {
+		switch kv[0] {
+		case "namespace":
+			res.Namespace = kv[1]
+		case "id":
+			res.ID = kv[1]
+		case "format":
+			res.CommonName = kv[1]
+		case "version":
+			res.Version = kv[1]
+		case "mime":
+			res.MIMEType = kv[1]
+		case "basis":
+			res.Basis = kv[1]
+		case "warning":
+			res.Warning = kv[1]
+		}
+	}
+
+	return &res, nil
+}
+
+// Version returns the version of the embedded Siegfried library.
+func (sf *siegfriedEmbed) Version() string {
+	return sf.version
+}
diff --git a/internal/fformat/siegfried_embed_test.go b/internal/fformat/siegfried_embed_test.go
new file mode 100644
index 00000000..4e217696
--- /dev/null
+++ b/internal/fformat/siegfried_embed_test.go
@@ -0,0 +1,48 @@
+package fformat_test
+
+import (
+	"testing"
+
+	"gotest.tools/v3/assert"
+
+	"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
+)
+
+func TestSiegfriedEmbed(t *testing.T) {
+	t.Parallel()
+
+	t.Run("Identifies a file", func(t *testing.T) {
+		t.Parallel()
+
+		sf := fformat.NewSiegfriedEmbed()
+
+		got, err := sf.Identify("siegfried_embed.go")
+		assert.NilError(t, err)
+		assert.DeepEqual(t, got, &fformat.FileFormat{
+			Namespace:  "pronom",
+			ID:         "x-fmt/111",
+			CommonName: "Plain Text File",
+			MIMEType:   "text/plain",
+			Basis:      "text match ASCII",
+			Warning:    "match on text only; extension mismatch",
+		})
+	})
+
+	t.Run("Errors when file not found", func(t *testing.T) {
+		t.Parallel()
+
+		sf := fformat.NewSiegfriedEmbed()
+		_, err := sf.Identify("foobar.txt")
+		assert.Error(t, err, "open foobar.txt: no such file or directory")
+	})
+}
+
+func BenchmarkSiegfried(b *testing.B) {
+ 
b.Run("SiegfriedEmbed", func(b *testing.B) { + sf := fformat.NewSiegfriedEmbed() + b.ResetTimer() + for i := 0; i < b.N; i++ { + sf.Identify("fformat.go") + } + }) +} diff --git a/internal/fvalidate/config.go b/internal/fvalidate/config.go new file mode 100644 index 00000000..98784cec --- /dev/null +++ b/internal/fvalidate/config.go @@ -0,0 +1,9 @@ +package fvalidate + +type Config struct { + VeraPDF VeraPDFConfig +} + +type VeraPDFConfig struct { + Path string +} diff --git a/internal/fvalidate/fake/mock_validator.go b/internal/fvalidate/fake/mock_validator.go new file mode 100644 index 00000000..4aa37615 --- /dev/null +++ b/internal/fvalidate/fake/mock_validator.go @@ -0,0 +1,154 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate (interfaces: Validator) +// +// Generated by this command: +// +// mockgen -typed -destination=./internal/fvalidate/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate Validator +// + +// Package fake is a generated GoMock package. +package fake + +import ( + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockValidator is a mock of Validator interface. +type MockValidator struct { + ctrl *gomock.Controller + recorder *MockValidatorMockRecorder +} + +// MockValidatorMockRecorder is the mock recorder for MockValidator. +type MockValidatorMockRecorder struct { + mock *MockValidator +} + +// NewMockValidator creates a new mock instance. +func NewMockValidator(ctrl *gomock.Controller) *MockValidator { + mock := &MockValidator{ctrl: ctrl} + mock.recorder = &MockValidatorMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockValidator) EXPECT() *MockValidatorMockRecorder { + return m.recorder +} + +// FormatIDs mocks base method. 
+func (m *MockValidator) FormatIDs() []string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FormatIDs") + ret0, _ := ret[0].([]string) + return ret0 +} + +// FormatIDs indicates an expected call of FormatIDs. +func (mr *MockValidatorMockRecorder) FormatIDs() *MockValidatorFormatIDsCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FormatIDs", reflect.TypeOf((*MockValidator)(nil).FormatIDs)) + return &MockValidatorFormatIDsCall{Call: call} +} + +// MockValidatorFormatIDsCall wrap *gomock.Call +type MockValidatorFormatIDsCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorFormatIDsCall) Return(arg0 []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.Return(arg0) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorFormatIDsCall) Do(f func() []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorFormatIDsCall) DoAndReturn(f func() []string) *MockValidatorFormatIDsCall { + c.Call = c.Call.DoAndReturn(f) + return c +} + +// Name mocks base method. +func (m *MockValidator) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. 
+func (mr *MockValidatorMockRecorder) Name() *MockValidatorNameCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockValidator)(nil).Name)) + return &MockValidatorNameCall{Call: call} +} + +// MockValidatorNameCall wrap *gomock.Call +type MockValidatorNameCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorNameCall) Return(arg0 string) *MockValidatorNameCall { + c.Call = c.Call.Return(arg0) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorNameCall) Do(f func() string) *MockValidatorNameCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorNameCall) DoAndReturn(f func() string) *MockValidatorNameCall { + c.Call = c.Call.DoAndReturn(f) + return c +} + +// Validate mocks base method. +func (m *MockValidator) Validate(arg0 string) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Validate", arg0) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Validate indicates an expected call of Validate. 
+func (mr *MockValidatorMockRecorder) Validate(arg0 any) *MockValidatorValidateCall { + mr.mock.ctrl.T.Helper() + call := mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Validate", reflect.TypeOf((*MockValidator)(nil).Validate), arg0) + return &MockValidatorValidateCall{Call: call} +} + +// MockValidatorValidateCall wrap *gomock.Call +type MockValidatorValidateCall struct { + *gomock.Call +} + +// Return rewrite *gomock.Call.Return +func (c *MockValidatorValidateCall) Return(arg0 string, arg1 error) *MockValidatorValidateCall { + c.Call = c.Call.Return(arg0, arg1) + return c +} + +// Do rewrite *gomock.Call.Do +func (c *MockValidatorValidateCall) Do(f func(string) (string, error)) *MockValidatorValidateCall { + c.Call = c.Call.Do(f) + return c +} + +// DoAndReturn rewrite *gomock.Call.DoAndReturn +func (c *MockValidatorValidateCall) DoAndReturn(f func(string) (string, error)) *MockValidatorValidateCall { + c.Call = c.Call.DoAndReturn(f) + return c +} diff --git a/internal/fvalidate/fvalidate.go b/internal/fvalidate/fvalidate.go new file mode 100644 index 00000000..f0c3854d --- /dev/null +++ b/internal/fvalidate/fvalidate.go @@ -0,0 +1,13 @@ +package fvalidate + +// Validator provides an interface for validating a file's format. +type Validator interface { + // FormatIDs lists the format IDs that the validator can validate. + FormatIDs() []string + + // Name of the validator. + Name() string + + // Validate validates the file at path. + Validate(path string) (string, error) +} diff --git a/internal/fvalidate/verapdf_validator.go b/internal/fvalidate/verapdf_validator.go new file mode 100644 index 00000000..a50804bf --- /dev/null +++ b/internal/fvalidate/verapdf_validator.go @@ -0,0 +1,84 @@ +package fvalidate + +import ( + "errors" + "fmt" + "os/exec" + + "github.com/go-logr/logr" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/fsutil" +) + +// pdfaPUIDs are the https://www.nationalarchives.gov.uk/pronom/ IDs of the +// PDF/A formats. 
+var pdfaPUIDs = []string{ + "fmt/95", // PDF/A 1a + "fmt/354", // PDF/A 1b + "fmt/476", // PDF/A 2a + "fmt/477", // PDF/A 2b + "fmt/478", // PDF/A 2u + "fmt/479", // PDF/A 3a + "fmt/480", // PDF/A 3b + "fmt/481", // PDF/A 3u + "fmt/1910", // PDF/A 4 + "fmt/1911", // PDF/A 4e + "fmt/1912", // PDF/A 4f +} + +type veraPDFValidator struct { + cmd string + logger logr.Logger +} + +var _ Validator = (*veraPDFValidator)(nil) + +func NewVeraPDFValidator(cmd string, logger logr.Logger) *veraPDFValidator { + return &veraPDFValidator{cmd: cmd, logger: logger} +} + +func (v *veraPDFValidator) FormatIDs() []string { + return pdfaPUIDs +} + +func (v *veraPDFValidator) Name() string { + return "veraPDF" +} + +func (v *veraPDFValidator) Validate(path string) (string, error) { + // If the veraPDF cmd path is not set then skip validation. + if v.cmd == "" { + return "", nil + } + + if !fsutil.FileExists(path) { + return "", fmt.Errorf("validate: file not found: %s", path) + } + + cmd := exec.Command(v.cmd, "--recurse", path) // #nosec: G204 -- trusted path. + + _, err := cmd.Output() + if err == nil { // error IS nil. + return "", nil + } + + e, ok := err.(*exec.ExitError) + if !ok { + return "", err + } + + switch e.ExitCode() { + case 1: + // Exit code 1 indicates a validation error, and there is no + // STDERR. In this case the full validation report is written to + // STDOUT, but we are ignoring it right now because it is very + // long. + return "One or more PDF/A files are invalid", nil + + default: + // Other exit codes (e.g. file not found) should write an error + // message to STDERR. 
+ v.logger.Info("veraPDF validate", "exit code", e.ExitCode(), "STDERR", string(e.Stderr)) + return "", errors.New("PDF/A validation failed with an application error") + } +} diff --git a/internal/fvalidate/verapdf_validator_test.go b/internal/fvalidate/verapdf_validator_test.go new file mode 100644 index 00000000..63612389 --- /dev/null +++ b/internal/fvalidate/verapdf_validator_test.go @@ -0,0 +1,96 @@ +package fvalidate_test + +import ( + "fmt" + "testing" + + "github.com/go-logr/logr" + "gotest.tools/v3/assert" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate" +) + +func TestFormatIDs(t *testing.T) { + t.Parallel() + + v := fvalidate.NewVeraPDFValidator("", logr.Discard()) + got := v.FormatIDs() + + assert.DeepEqual(t, got, []string{ + "fmt/95", // PDF/A 1a + "fmt/354", // PDF/A 1b + "fmt/476", // PDF/A 2a + "fmt/477", // PDF/A 2b + "fmt/478", // PDF/A 2u + "fmt/479", // PDF/A 3a + "fmt/480", // PDF/A 3b + "fmt/481", // PDF/A 3u + "fmt/1910", // PDF/A 4 + "fmt/1911", // PDF/A 4e + "fmt/1912", // PDF/A 4f + }) +} + +func TestName(t *testing.T) { + t.Parallel() + + v := fvalidate.NewVeraPDFValidator("", logr.Discard()) + got := v.Name() + + assert.Equal(t, got, "veraPDF") +} + +func empty(td string) string { + return "" +} + +func TestValidate(t *testing.T) { + t.Parallel() + + type test struct { + name string + cmd string + path func(td string) string + want func(td string) string + wantErr func(td string) string + } + for _, tt := range []test{ + { + name: "Does nothing when cmd is not set", + path: empty, + }, + { + name: "Errors when path doesn't exist", + cmd: "echo", + path: func(td string) string { return td + "/foo" }, + wantErr: func(td string) string { + return fmt.Sprintf("validate: file not found: %s/foo", td) + }, + }, + { + name: "Returns nothing when no error", + cmd: "echo", + path: func(td string) string { return td }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + v := 
fvalidate.NewVeraPDFValidator(tt.cmd, logr.Discard()) + td := t.TempDir() + got, err := v.Validate(tt.path(td)) + if tt.wantErr != nil { + assert.Error(t, err, tt.wantErr(td)) + return + } + + assert.NilError(t, err) + + if tt.want != nil { + assert.Equal(t, got, tt.want(td)) + } else { + assert.Equal(t, got, "") + } + }) + } +} diff --git a/internal/workflow/preprocessing.go b/internal/workflow/preprocessing.go index eb30072e..09ee73ae 100644 --- a/internal/workflow/preprocessing.go +++ b/internal/workflow/preprocessing.go @@ -180,7 +180,7 @@ func (w *PreprocessingWorkflow) Execute( checksumEv.Succeed(ctx, "SIP checksums match file contents") } - // Validate file formats. + // Verify that SIP file formats are on allowlist. ev = result.newEvent(ctx, "Validate SIP file formats") var ffvalidateResult ffvalidate.Result e = temporalsdk_workflow.ExecuteActivity( @@ -203,6 +203,37 @@ func (w *PreprocessingWorkflow) Execute( ev.Succeed(ctx, "No disallowed file formats found") } + // Validate the SIP files. + ev = result.newEvent(ctx, "Validate SIP files") + var validateFilesResult activities.ValidateFilesResult + e = temporalsdk_workflow.ExecuteActivity( + temporalsdk_workflow.WithActivityOptions( + ctx, + temporalsdk_workflow.ActivityOptions{ + ScheduleToCloseTimeout: time.Hour, + RetryPolicy: &temporalsdk_temporal.RetryPolicy{ + MaximumAttempts: 1, + }, + }, + ), + activities.ValidateFilesName, + &activities.ValidateFilesParams{SIP: identifySIP.SIP}, + ).Get(ctx, &validateFilesResult) + if e != nil { + result.systemError(ctx, e, ev, "System error: file validation has failed") + return result, nil + } + + if validateFilesResult.Failures != nil { + result.validationError( + ctx, + ev, + "file validation has failed. One or more files are invalid", validateFilesResult.Failures, + ) + } else { + ev.Succeed(ctx, "No invalid files found") + } + // Validate metadata. 
ev = result.newEvent(ctx, "Validate SIP metadata") var validateMetadata xmlvalidate.Result diff --git a/internal/workflow/preprocessing_test.go b/internal/workflow/preprocessing_test.go index 95119bb8..023c16b7 100644 --- a/internal/workflow/preprocessing_test.go +++ b/internal/workflow/preprocessing_test.go @@ -142,6 +142,10 @@ func (s *PreprocessingTestSuite) SetupTest(cfg *config.Configuration) { ffvalidate.New(ffvalidate.Config{}).Execute, temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name}, ) + s.env.RegisterActivityWithOptions( + activities.NewValidateFiles(nil).Execute, + temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName}, + ) s.env.RegisterActivityWithOptions( activities.NewAddPREMISObjects(rand.Reader).Execute, temporalsdk_activity.RegisterOptions{Name: activities.AddPREMISObjectsName}, @@ -255,6 +259,13 @@ func (s *PreprocessingTestSuite) TestPreprocessingWorkflowSuccess() { ).Return( &ffvalidate.Result{}, nil, ) + s.env.OnActivity( + activities.ValidateFilesName, + sessionCtx, + &activities.ValidateFilesParams{SIP: expectedSIP}, + ).Return( + &activities.ValidateFilesResult{}, nil, + ) s.env.OnActivity( xmlvalidate.Name, sessionCtx, @@ -407,6 +418,13 @@ func (s *PreprocessingTestSuite) TestPreprocessingWorkflowSuccess() { StartedAt: testTime, CompletedAt: testTime, }, + { + Name: "Validate SIP files", + Message: "No invalid files found", + Outcome: enums.EventOutcomeSuccess, + StartedAt: testTime, + CompletedAt: testTime, + }, { Name: "Validate SIP metadata", Message: "Metadata validation successful", @@ -539,11 +557,21 @@ func (s *PreprocessingTestSuite) TestPreprocessingWorkflowValidationFails() { &ffvalidate.Params{Path: expectedSIP.ContentPath}, ).Return( &ffvalidate.Result{Failures: []string{ - `file format fmt/11 not allowed: "fake/path/to/sip/dir/file1.png"`, - `file format fmt/11 not allowed: "fake/path/to/sip/file2.png"`, + `file format fmt/11 not allowed: "content/content/d_0000001/00000010.png"`, + `file format 
fmt/11 not allowed: "content/content/d_0000001/00000011.png"`, }}, nil, ) + s.env.OnActivity( + activities.ValidateFilesName, + sessionCtx, + &activities.ValidateFilesParams{SIP: expectedSIP}, + ).Return( + &activities.ValidateFilesResult{ + Failures: []string{`invalid PDF/A: "contents/contents/d_0000001/test.pdf"`}, + }, + nil, + ) s.env.OnActivity( xmlvalidate.Name, sessionCtx, @@ -610,8 +638,16 @@ Checksum mismatch for "content/content/d_0000001/00000001.jp2" (expected: "827cc { Name: "Validate SIP file formats", Message: `Content error: file format validation has failed. One or more file formats are not allowed: -file format fmt/11 not allowed: "fake/path/to/sip/dir/file1.png" -file format fmt/11 not allowed: "fake/path/to/sip/file2.png"`, +file format fmt/11 not allowed: "content/content/d_0000001/00000010.png" +file format fmt/11 not allowed: "content/content/d_0000001/00000011.png"`, + Outcome: enums.EventOutcomeValidationFailure, + StartedAt: testTime, + CompletedAt: testTime, + }, + { + Name: "Validate SIP files", + Message: `Content error: file validation has failed. 
One or more files are invalid: +invalid PDF/A: "contents/contents/d_0000001/test.pdf"`, Outcome: enums.EventOutcomeValidationFailure, StartedAt: testTime, CompletedAt: testTime, diff --git a/verapdf_report.json b/verapdf_report.json new file mode 100644 index 00000000..2762cdf1 --- /dev/null +++ b/verapdf_report.json @@ -0,0 +1,385 @@ +{ + "report": { + "buildInformation": { + "releaseDetails": [ + { + "id": "core", + "version": "1.26.1", + "buildDate": 1715877000000 + }, + { + "id": "validation-model", + "version": "1.26.1", + "buildDate": 1715883180000 + }, + { + "id": "gui", + "version": "1.26.2", + "buildDate": 1716125580000 + } + ] + }, + "jobs": [ + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0001/p0003.pdf", + "size": 47975 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1689, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 
1732751712851, + "finish": 1732751713045, + "duration": "00:00:00.194", + "difference": 194 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0001/p0001.pdf", + "size": 49001 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1696, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713056, + "finish": 1732751713073, + "duration": "00:00:00.017", + "difference": 17 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0001/p0002.pdf", + "size": 46808 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1675, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If 
the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713075, + "finish": 1732751713091, + "duration": "00:00:00.016", + "difference": 16 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0002/p0004.pdf", + "size": 48932 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1700, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the 
Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713093, + "finish": 1732751713110, + "duration": "00:00:00.017", + "difference": 17 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0002/p0005.pdf", + "size": 49076 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1709, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713111, + "finish": 1732751713131, + "duration": "00:00:00.020", + "difference": 20 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0003/d0004/p0006.pdf", + "size": 47900 + }, + 
"validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1698, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713132, + "finish": 1732751713146, + "duration": "00:00:00.014", + "difference": 14 + } + }, + { + "itemDetails": { + "name": "/home/preprocessing/shared/SFA-Test-Born-Digital-AIP/content/content/d0003/d0004/p0007.pdf", + "size": 46218 + }, + "validationResult": { + "details": { + "passedRules": 133, + "failedRules": 1, + "passedChecks": 1649, + "failedChecks": 1, + "ruleSummaries": [ + { + "ruleStatus": "FAILED", + "specification": "ISO 19005-1:2005", + "clause": "6.8.4", + "testNumber": 1, + "status": "failed", + "failedChecks": 1, + "description": "If the Lang entry is present in the document catalog dictionary or in a structure element dictionary or property list, its value shall be a language identifier as defined by RFC 1766, Tags for the Identification of Languages, as described in 
PDF Reference 9.8.1", + "object": "CosLang", + "test": "unicodeValue == '' || /^[a-zA-Z]{1,8}(-[a-zA-Z]{1,8})*$/.test(unicodeValue)", + "checks": [ + { + "status": "failed", + "context": "root/document[0]/StructTreeRoot[0](19 0 obj PDStructTreeRoot)/K[0](21 0 obj SESect Sect)/Lang[0]", + "errorMessage": "Value DE of the Lang entry is not a Language-Tag", + "errorArguments": [ + "DE\\u0000" + ] + } + ] + } + ] + }, + "jobEndStatus": "normal", + "profileName": "PDF/A-1A validation profile", + "statement": "PDF file is not compliant with Validation Profile requirements.", + "compliant": false + }, + "processingTime": { + "start": 1732751713147, + "finish": 1732751713160, + "duration": "00:00:00.013", + "difference": 13 + } + } + ], + "batchSummary": { + "duration": { + "start": 1732751712806, + "finish": 1732751713161, + "duration": "00:00:00.355", + "difference": 355 + }, + "totalJobs": 7, + "outOfMemory": 0, + "veraExceptions": 0, + "failedParsingJobs": 0, + "failedEncryptedJobs": 0, + "validationSummary": { + "nonCompliantPdfaCount": 7, + "failedJobCount": 0, + "totalJobCount": 7, + "compliantPdfaCount": 0, + "successfulJobCount": 7 + }, + "multiJob": true, + "featuresSummary": { + "failedJobCount": 0, + "totalJobCount": 0, + "successfulJobCount": 0 + }, + "repairSummary": { + "failedJobCount": 0, + "totalJobCount": 0, + "successfulJobCount": 0 + } + } + } +}