Skip to content

Commit

Permalink
Read allowed file formats from a CSV file
Browse files Browse the repository at this point in the history
Fixes #60

- Add a "fileFormat" section to the preprocessing config file
- Add an "AllowlistPath" config option to the "fileFormat" section
- If no "AllowlistPath" is configured, then file format validation will
  be skipped
- Add an allowed format CSV file to the dev and enduro kube configs
  • Loading branch information
djjuhasz committed Oct 15, 2024
1 parent c81f4e2 commit cb669bf
Show file tree
Hide file tree
Showing 11 changed files with 286 additions and 57 deletions.
2 changes: 1 addition & 1 deletion cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (m *Main) Run(ctx context.Context) error {
temporalsdk_activity.RegisterOptions{Name: activities.VerifyManifestName},
)
w.RegisterActivityWithOptions(
activities.NewValidateFileFormats().Execute,
activities.NewValidateFileFormats(m.cfg.FileFormat).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.ValidateFileFormatsName},
)
w.RegisterActivityWithOptions(
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ require (
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.9.0
github.com/tonglil/buflogr v1.1.1
go.artefactual.dev/tools v0.14.0
go.temporal.io/api v1.32.0
go.temporal.io/sdk v1.26.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
github.com/tonglil/buflogr v1.1.1 h1:CKAjOHBSMmgbRFxpn/RhQHPj5oANc7ekhlsoUDvcZIg=
github.com/tonglil/buflogr v1.1.1/go.mod h1:WLLtPRLqcFYWQLbA+ytXy5WrFTYnfA+beg1MpvJCxm4=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
Expand Down
39 changes: 39 additions & 0 deletions hack/kube/overlays/dev/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,42 @@ stringData:
secretKey = "minio123"
region = "us-west-1"
bucket = "ais"
[fileformat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
allowed_file_formats.csv: |
PRONOM_ID
fmt/95
x-fmt/16
x-fmt/21
x-fmt/22
x-fmt/62
x-fmt/111
x-fmt/282
x-fmt/283
fmt/354
fmt/476
fmt/477
fmt/478
x-fmt/18
fmt/161
fmt/1196
fmt/1777
fmt/353
x-fmt/392
fmt/1
fmt/2
fmt/6
fmt/141
fmt/569
fmt/199
fmt/101
fmt/142
x-fmt/280
fmt/1014
fmt/1012
fmt/654
fmt/1013
fmt/1011
fmt/653
39 changes: 39 additions & 0 deletions hack/kube/overlays/enduro/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,42 @@ stringData:
secretKey = "minio123"
region = "us-west-1"
bucket = "ais"
[fileFormat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
allowed_file_formats.csv: |
PRONOM_ID
fmt/95
x-fmt/16
x-fmt/21
x-fmt/22
x-fmt/62
x-fmt/111
x-fmt/282
x-fmt/283
fmt/354
fmt/476
fmt/477
fmt/478
x-fmt/18
fmt/161
fmt/1196
fmt/1777
fmt/353
x-fmt/392
fmt/1
fmt/2
fmt/6
fmt/141
fmt/569
fmt/199
fmt/101
fmt/142
x-fmt/280
fmt/1014
fmt/1012
fmt/654
fmt/1013
fmt/1011
fmt/653
115 changes: 68 additions & 47 deletions internal/activities/validate_file_formats.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,75 +2,65 @@ package activities

import (
"context"
"encoding/csv"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"

"go.artefactual.dev/tools/temporal"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
)

const ValidateFileFormatsName = "validate-file-formats"

type ValidateFileFormatsParams struct {
SIP sip.SIP
}

type ValidateFileFormatsResult struct {
Failures []string
}
type (
ValidateFileFormats struct {
cfg fformat.Config
}
ValidateFileFormatsParams struct {
SIP sip.SIP
}
ValidateFileFormatsResult struct {
Failures []string
}
)

type ValidateFileFormats struct{}
type formatList map[string]struct{}

func NewValidateFileFormats() *ValidateFileFormats {
return &ValidateFileFormats{}
func NewValidateFileFormats(cfg fformat.Config) *ValidateFileFormats {
return &ValidateFileFormats{cfg: cfg}
}

func (a *ValidateFileFormats) Execute(
ctx context.Context,
params *ValidateFileFormatsParams,
) (*ValidateFileFormatsResult, error) {
var failures []string
logger := temporal.GetLogger(ctx)

sf := fformat.NewSiegfriedEmbed()
// TODO(daniel): make allowed list configurable.
allowed := map[string]struct{}{
"fmt/95": {},
"x-fmt/16": {},
"x-fmt/21": {},
"x-fmt/22": {},
"x-fmt/62": {},
"x-fmt/111": {},
"x-fmt/282": {},
"x-fmt/283": {},
"fmt/354": {},
"fmt/476": {},
"fmt/477": {},
"fmt/478": {},
"x-fmt/18": {},
"fmt/161": {},
"fmt/1196": {},
"fmt/1777": {},
"fmt/353": {},
"x-fmt/392": {},
"fmt/1": {},
"fmt/2": {},
"fmt/6": {},
"fmt/141": {},
"fmt/569": {},
"fmt/199": {},
"fmt/101": {},
"fmt/142": {},
"x-fmt/280": {},
"fmt/1014": {},
"fmt/1012": {},
"fmt/654": {},
"fmt/1013": {},
"fmt/1011": {},
"fmt/653": {},
if a.cfg.AllowlistPath == "" {
logger.Info("ValidateFileFormats: No file format allowlist path set, skipping file format validation")

return nil, nil
}

f, err := os.Open(a.cfg.AllowlistPath)
if err != nil {
return nil, fmt.Errorf("ValidateFileFormats: %v", err)
}
defer f.Close()

err := filepath.WalkDir(params.SIP.ContentPath, func(p string, d fs.DirEntry, err error) error {
allowed, err := parseFormatList(f)
if err != nil {
return nil, fmt.Errorf("ValidateFileFormats: load allowed formats: %v", err)
}

sf := fformat.NewSiegfriedEmbed()
err = filepath.WalkDir(params.SIP.ContentPath, func(p string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
Expand All @@ -96,3 +86,34 @@ func (a *ValidateFileFormats) Execute(

return &ValidateFileFormatsResult{Failures: failures}, nil
}

// parseFormatList reads a CSV allowlist from r and returns the set of file
// format identifiers found in the first column of each data row.
//
// The first row is treated as a header and discarded, and any columns after
// the first are ignored. Leading whitespace is trimmed from every field, and
// rows whose first column is empty are skipped so that a blank identifier is
// never recorded as an allowed format.
//
// An error is returned when r does not contain valid CSV (the underlying
// encoding/csv error is wrapped, so errors.Is and errors.As work on it), or
// when no identifiers are found after the header row.
func parseFormatList(r io.Reader) (formatList, error) {
	formats := make(formatList)

	cr := csv.NewReader(r)
	// Tolerate indentation in front of identifiers; without this a padded
	// value such as " fmt/95" would be stored verbatim, padding included.
	cr.TrimLeadingSpace = true

	header := true
	for {
		row, err := cr.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("invalid CSV: %w", err)
		}

		// Skip the first row.
		if header {
			header = false
			continue
		}

		// Ignore rows with no identifier in the first column rather than
		// adding the empty string to the allowlist.
		if len(row) == 0 || row[0] == "" {
			continue
		}

		// Get the file format identifier from the first column of each row
		// and ignore subsequent columns.
		formats[row[0]] = struct{}{}
	}

	if len(formats) == 0 {
		return nil, fmt.Errorf("no allowed file formats")
	}

	return formats, nil
}
Loading

0 comments on commit cb669bf

Please sign in to comment.