Skip to content

Commit

Permalink
Add veraPDF validation of PDF/A files
Browse files Browse the repository at this point in the history
- Add verapdf and the JRE to the worker Docker image
- Add a validate_files activity to identify SIP file formats then
  validate the file formats for which we have a validator
- Copy siegfried_embed and the format Identifier interface from
  https://github.com/artefactual-sdps/temporal-activities
- Add the fvalidate package and Validator interface
- Add a veraPDF implementation of the Validator interface
- Run veraPDF in "batch" mode to minimize startup overheads
- Add file validation configuration to config file
- Add processing events for file validation success and failure
- Add veraPDF binary path to kube dev overlay
  • Loading branch information
djjuhasz committed Dec 3, 2024
1 parent b5a1f33 commit 458d34b
Show file tree
Hide file tree
Showing 23 changed files with 1,640 additions and 11 deletions.
13 changes: 13 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,29 @@ RUN --mount=type=cache,target=/go/pkg/mod \
-o /out/preprocessing-worker \
./cmd/worker

# Build worker image
FROM alpine:3.20 AS preprocessing-worker
RUN apk add --update --no-cache libxml2-utils

# Copy the JRE (Eclipse Temurin v11) from the verapdf/cli image
ENV JAVA_HOME=/opt/java/openjdk
ENV PATH="${JAVA_HOME}/bin:${PATH}"
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link $JAVA_HOME $JAVA_HOME

ARG USER_ID=1000
ARG GROUP_ID=1000
RUN addgroup -g ${GROUP_ID} -S preprocessing
RUN adduser -u ${USER_ID} -S -D preprocessing preprocessing

# Make preprocessing the owner of the verapdf log dir
RUN mkdir --parents /var/opt/verapdf/logs && chown -R preprocessing:preprocessing /var/opt/verapdf

USER preprocessing

COPY --from=build-preprocessing-worker --link /out/preprocessing-worker /home/preprocessing/bin/preprocessing-worker
RUN mkdir /home/preprocessing/shared

# Copy the veraPDF application (v1.27.96) from the verapdf/cli image
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link /opt/verapdf/ /opt/verapdf/

CMD ["/home/preprocessing/bin/preprocessing-worker"]
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ include hack/make/dep_golines.mk
include hack/make/dep_gomajor.mk
include hack/make/dep_gosec.mk
include hack/make/dep_gotestsum.mk
include hack/make/dep_mockgen.mk
include hack/make/dep_shfmt.mk
include hack/make/dep_tparse.mk
include hack/make/enums.mk
Expand All @@ -30,6 +31,7 @@ TOOLS = $(GOLANGCI_LINT) \
$(GOMAJOR) \
$(GOSEC) \
$(GOTESTSUM) \
$(MOCKGEN) \
$(SHFMT) \
$(TPARSE)

Expand All @@ -40,7 +42,9 @@ endef

IGNORED_PACKAGES := \
github.com/artefactual-sdps/preprocessing-sfa/hack/% \
github.com/artefactual-sdps/preprocessing-sfa/internal/%/fake \
github.com/artefactual-sdps/preprocessing-sfa/internal/enums

PACKAGES := $(shell go list ./...)
TEST_PACKAGES := $(filter-out $(IGNORED_PACKAGES),$(PACKAGES))
TEST_IGNORED_PACKAGES := $(filter $(IGNORED_PACKAGES),$(PACKAGES))
Expand All @@ -55,6 +59,11 @@ deps: # @HELP List available module dependency updates.
deps: $(GOMAJOR)
gomajor list

gen-mock: # @HELP Generate mocks.
gen-mock: $(MOCKGEN)
mockgen -typed -destination=./internal/fformat/fake/mock_identifier.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Identifier
mockgen -typed -destination=./internal/fvalidate/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate Validator

golines: # @HELP Run the golines formatter to fix long lines.
golines: GOLINES_OUT_MODE ?= write-output
golines: $(GOLINES)
Expand Down
9 changes: 9 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (

"github.com/artefactual-sdps/preprocessing-sfa/internal/activities"
"github.com/artefactual-sdps/preprocessing-sfa/internal/config"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
"github.com/artefactual-sdps/preprocessing-sfa/internal/workflow"
)

Expand Down Expand Up @@ -78,6 +80,13 @@ func (m *Main) Run(ctx context.Context) error {
ffvalidate.New(m.cfg.FileFormat).Execute,
temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name},
)
w.RegisterActivityWithOptions(
activities.NewValidateFiles(
fformat.NewSiegfriedEmbed(),
fvalidate.NewVeraPDFValidator(m.cfg.FileValidate.VeraPDF.Path, m.logger),
).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName},
)
w.RegisterActivityWithOptions(
activities.NewAddPREMISObjects(rand.Reader).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.AddPREMISObjectsName},
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ require (
github.com/google/uuid v1.6.0
github.com/hashicorp/go-cleanhttp v0.5.2
github.com/oklog/run v1.1.0
github.com/richardlehane/siegfried v1.11.1
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.9.0
go.artefactual.dev/tools v0.14.0
go.temporal.io/sdk v1.26.1
go.uber.org/mock v0.4.0
gocloud.dev v0.39.0
gotest.tools/v3 v3.5.1
)
Expand Down Expand Up @@ -67,7 +69,6 @@ require (
github.com/richardlehane/match v1.0.5 // indirect
github.com/richardlehane/mscfb v1.0.4 // indirect
github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/richardlehane/siegfried v1.11.1 // indirect
github.com/richardlehane/xmldetect v1.0.2 // indirect
github.com/robfig/cron v1.2.0 // indirect
github.com/ross-spencer/spargo v0.4.1 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU=
go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
Expand Down
3 changes: 3 additions & 0 deletions hack/kube/overlays/dev/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ stringData:
[fileformat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
[filevalidate.verapdf]
path = "/opt/verapdf/verapdf"
allowed_file_formats.csv: |
Format name,PRONOM PUID
text,x-fmt/16
Expand Down
16 changes: 16 additions & 0 deletions hack/make/dep_mockgen.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
$(call _assert_var,MAKEDIR)
$(call _conditional_include,$(MAKEDIR)/base.mk)
$(call _assert_var,CACHE_VERSIONS)
$(call _assert_var,CACHE_BIN)

MOCKGEN_VERSION ?= 0.4.0

# MOCKGEN is the version marker file for the pinned mockgen release. Targets
# that list it as a prerequisite run the install recipe below whenever the
# marker for MOCKGEN_VERSION does not exist yet.
MOCKGEN := $(CACHE_VERSIONS)/mockgen/$(MOCKGEN_VERSION)
# Install mockgen at MOCKGEN_VERSION into CACHE_BIN (replacing any binary that
# is already there), then recreate the marker directory so that it holds only
# the marker for the version that was just installed.
$(MOCKGEN):
rm -f $(CACHE_BIN)/mockgen
mkdir -p $(CACHE_BIN)
env GOBIN=$(CACHE_BIN) go install go.uber.org/mock/mockgen@v$(MOCKGEN_VERSION)
chmod +x $(CACHE_BIN)/mockgen
rm -rf $(dir $(MOCKGEN))
mkdir -p $(dir $(MOCKGEN))
touch $(MOCKGEN)
129 changes: 129 additions & 0 deletions internal/activities/validate_files.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package activities

import (
"context"
"errors"
"fmt"
"io/fs"
"path/filepath"
"slices"

"go.artefactual.dev/tools/temporal"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
)

const ValidateFilesName = "validate-files"

// ValidateFiles is an activity that identifies the formats of the files in a
// SIP and runs the configured validators against them.
type ValidateFiles struct {
	// identifier determines the format of each file.
	identifier fformat.Identifier
	// validators are run in order against the SIP content.
	validators []fvalidate.Validator
}

// ValidateFilesParams is the input of the ValidateFiles activity.
type ValidateFilesParams struct {
	// SIP is the SIP whose content is validated.
	SIP sip.SIP
}

// ValidateFilesResult is the output of the ValidateFiles activity.
type ValidateFilesResult struct {
	// Failures holds the non-empty output returned by each validator.
	Failures []string
}

// fileFormats maps the path of each identified file to its file format.
type fileFormats map[string]*fformat.FileFormat

// NewValidateFiles returns a ValidateFiles activity that uses idr to identify
// file formats and vdrs, in the order given, to validate files.
func NewValidateFiles(idr fformat.Identifier, vdrs ...fvalidate.Validator) *ValidateFiles {
	a := new(ValidateFiles)
	a.identifier = idr
	a.validators = vdrs

	return a
}

// Execute identifies the format of every file under the SIP content path,
// then runs each configured validator whose supported formats occur among the
// identified files. The only format validator currently implemented is
// veraPDF for PDF/A.
//
// The result lists the non-empty output returned by each validator. An error
// is returned when format identification or a validator run fails; the
// underlying error is wrapped so callers can inspect it with errors.Is and
// errors.As.
func (a *ValidateFiles) Execute(ctx context.Context, params *ValidateFilesParams) (*ValidateFilesResult, error) {
	formats, err := a.identifyFormats(ctx, params.SIP)
	if err != nil {
		return nil, fmt.Errorf("identifyFormats: %w", err)
	}

	failures, err := a.validateFiles(params.SIP, formats)
	if err != nil {
		return nil, fmt.Errorf("validateFiles: %w", err)
	}

	return &ValidateFilesResult{Failures: failures}, nil
}

// identifyFormats walks the SIP content directory and identifies the format
// of every non-directory entry, returning a map of file path to identified
// format.
//
// A file whose format can't be identified is logged, together with the reason,
// and left out of the result rather than aborting the walk. The walk stops
// with an error if the directory can't be traversed or ctx is done.
func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileFormats, error) {
	logger := temporal.GetLogger(ctx)
	formats := make(fileFormats)

	err := filepath.WalkDir(sip.ContentPath, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// Stop walking as soon as the activity context is done.
		if ctx.Err() != nil {
			return errors.New("context cancelled")
		}

		if d.IsDir() {
			return nil
		}

		ff, err := a.identifier.Identify(path)
		if err != nil {
			// Identification is best-effort: record why it failed and move
			// on to the next file.
			logger.Info("format identification failed", "path", path, "error", err.Error())
			return nil
		}
		formats[path] = ff

		return nil
	})
	if err != nil {
		return nil, err
	}

	return formats, nil
}

// validateFiles runs every configured validator, in order, against the SIP
// content path and collects each non-empty validator output. It returns nil
// and the error as soon as any validator fails.
func (a *ValidateFiles) validateFiles(
	sip sip.SIP,
	files fileFormats,
) ([]string, error) {
	var failures []string

	for i := range a.validators {
		report, err := validate(a.validators[i], sip.ContentPath, files)
		switch {
		case err != nil:
			return nil, err
		case report != "":
			failures = append(failures, report)
		}
	}

	return failures, nil
}

// validate runs validator v against path, but only when at least one of the
// identified files has a format ID that v reports via FormatIDs.
//
// It returns the validator output, or an empty string and a nil error when no
// file has a matching format and the validator is therefore skipped. When the
// validator returns an error, the output is discarded and the error returned.
func validate(v fvalidate.Validator, path string, files fileFormats) (string, error) {
	allowedIds := v.FormatIDs()

	canValidate := false
	for _, f := range files {
		// Skip nil entries so a missing format can't cause a nil dereference.
		if f != nil && slices.Contains(allowedIds, f.ID) {
			canValidate = true
			break
		}
	}

	if !canValidate {
		return "", nil
	}

	out, err := v.Validate(path)
	if err != nil {
		return "", err
	}

	return out, nil
}
Loading

0 comments on commit 458d34b

Please sign in to comment.