-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add veraPDF validation of PDF/A files
- Add veraPDF and the JRE to the worker Docker image
- Add a validate_files activity to identify SIP file formats, then validate the file formats for which we have a validator
- Copy siegfried_embed and the format Identifier interface from https://github.com/artefactual-sdps/temporal-activities
- Add the fvalidate package and Validator interface
- Add a veraPDF implementation of the Validator interface
- Run veraPDF in "batch" mode to minimize startup overhead
- Add file validation configuration to the config file
- Add processing events for file validation success and failure
- Add the veraPDF binary path to the kube dev overlay
- Loading branch information
Showing
23 changed files
with
1,640 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Makefile fragment that installs a pinned mockgen binary into the shared tool cache. | ||
# NOTE(review): _assert_var and _conditional_include are defined outside this fragment; | ||
# presumably they fail on an unset variable and include a file once -- confirm. | ||
$(call _assert_var,MAKEDIR) | ||
$(call _conditional_include,$(MAKEDIR)/base.mk) | ||
$(call _assert_var,CACHE_VERSIONS) | ||
$(call _assert_var,CACHE_BIN) | ||
|
||
# mockgen version to install; "?=" only assigns when the variable is not already set. | ||
MOCKGEN_VERSION ?= 0.4.0 | ||
|
||
# MOCKGEN is a marker file named after the version; the rule below touches it last, | ||
# so an existing marker means that version has already been installed. | ||
# The rule replaces $(CACHE_BIN)/mockgen via "go install" with GOBIN pointed at | ||
# $(CACHE_BIN), then recreates the marker directory, which also drops markers | ||
# left behind by other versions. | ||
MOCKGEN := $(CACHE_VERSIONS)/mockgen/$(MOCKGEN_VERSION) | ||
$(MOCKGEN): | ||
rm -f $(CACHE_BIN)/mockgen | ||
mkdir -p $(CACHE_BIN) | ||
env GOBIN=$(CACHE_BIN) go install go.uber.org/mock/mockgen@v$(MOCKGEN_VERSION) | ||
chmod +x $(CACHE_BIN)/mockgen | ||
rm -rf $(dir $(MOCKGEN)) | ||
mkdir -p $(dir $(MOCKGEN)) | ||
touch $(MOCKGEN) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
package activities | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"io/fs" | ||
"path/filepath" | ||
"slices" | ||
|
||
"go.artefactual.dev/tools/temporal" | ||
|
||
"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat" | ||
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate" | ||
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip" | ||
) | ||
|
||
const ValidateFilesName = "validate-files" | ||
|
||
type ( | ||
ValidateFiles struct { | ||
identifier fformat.Identifier | ||
validators []fvalidate.Validator | ||
} | ||
ValidateFilesParams struct { | ||
SIP sip.SIP | ||
} | ||
ValidateFilesResult struct { | ||
Failures []string | ||
} | ||
) | ||
|
||
type fileFormats map[string]*fformat.FileFormat | ||
|
||
func NewValidateFiles(idr fformat.Identifier, vdrs ...fvalidate.Validator) *ValidateFiles { | ||
return &ValidateFiles{ | ||
identifier: idr, | ||
validators: vdrs, | ||
} | ||
} | ||
|
||
// Execute validates SIP files against a file format specification. The | ||
// only format validator currently implemented verapdf for PDF/A. | ||
func (a *ValidateFiles) Execute(ctx context.Context, params *ValidateFilesParams) (*ValidateFilesResult, error) { | ||
formats, err := a.identifyFormats(ctx, params.SIP) | ||
if err != nil { | ||
return nil, fmt.Errorf("identifyFormats: %v", err) | ||
} | ||
|
||
failures, err := a.validateFiles(params.SIP, formats) | ||
if err != nil { | ||
return nil, fmt.Errorf("validateFiles: %v", err) | ||
} | ||
|
||
return &ValidateFilesResult{Failures: failures}, nil | ||
} | ||
|
||
func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileFormats, error) { | ||
logger := temporal.GetLogger(ctx) | ||
formats := make(fileFormats) | ||
err := filepath.WalkDir(sip.ContentPath, func(path string, d fs.DirEntry, err error) error { | ||
if err != nil { | ||
return err | ||
} | ||
|
||
if ctx.Err() != nil { | ||
return errors.New("context cancelled") | ||
} | ||
|
||
if d.IsDir() { | ||
return nil | ||
} | ||
|
||
ff, err := a.identifier.Identify(path) | ||
if err != nil { | ||
logger.Info("format identication failed", "path", path) | ||
} else { | ||
formats[path] = ff | ||
} | ||
|
||
return nil | ||
}) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return formats, nil | ||
} | ||
|
||
func (a *ValidateFiles) validateFiles( | ||
sip sip.SIP, | ||
files fileFormats, | ||
) ([]string, error) { | ||
var failures []string | ||
for _, v := range a.validators { | ||
out, err := validate(v, sip.ContentPath, files) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if out != "" { | ||
failures = append(failures, out) | ||
} | ||
} | ||
|
||
return failures, nil | ||
} | ||
|
||
func validate(v fvalidate.Validator, path string, files fileFormats) (string, error) { | ||
var canValidate bool | ||
allowedIds := v.FormatIDs() | ||
|
||
for _, f := range files { | ||
if slices.Contains(allowedIds, f.ID) { | ||
canValidate = true | ||
break | ||
} | ||
} | ||
|
||
if !canValidate { | ||
return "", nil | ||
} | ||
|
||
out, err := v.Validate(path) | ||
if err != nil { | ||
return "", err | ||
} | ||
|
||
return out, nil | ||
} |
Oops, something went wrong.