Skip to content

Commit

Permalink
Combine PREMIS files from Vecteur AIPs
Browse files Browse the repository at this point in the history
  • Loading branch information
mcantelon authored and jraddaoui committed May 17, 2024
1 parent 95b103e commit a1ae576
Show file tree
Hide file tree
Showing 7 changed files with 186 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ func (m *Main) Run(ctx context.Context) error {
activities.NewMetadataValidationActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.MetadataValidationName},
)
w.RegisterActivityWithOptions(
activities.NewCombinePREMISActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName},
)
w.RegisterActivityWithOptions(
activities.NewSipCreationActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.SipCreationName},
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.22.3

require (
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9
github.com/beevik/etree v1.4.0
github.com/go-logr/logr v1.4.1
github.com/nyudlts/go-bagit v0.3.0-alpha
github.com/otiai10/copy v1.14.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 h1:JhybsG9MteMDlkXH/dHsENUon/8l9KY7VUc6F4XOOts=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9/go.mod h1:uf0jIGyZGHi3oTfhg+QkwCkyTaGhtAwromhwouC1FhU=
github.com/beevik/etree v1.4.0 h1:oz1UedHRepuY3p4N5OjE0nK1WLCqtzHf25bxplKOHLs=
github.com/beevik/etree v1.4.0/go.mod h1:cyWiXwGoasx60gHvtnEh5x8+uIjUVnjWqBvEnhnqKDA=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
Expand Down
5 changes: 5 additions & 0 deletions hack/sampledata/xsd/empty_premis.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Empty PREMIS 3.0 document used as the starting point for the combined premis.xml. -->
<premis:premis version="3.0"
  xmlns:premis="http://www.loc.gov/premis/v3"
  xmlns="http://www.loc.gov/premis/v3"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/premis/v3 http://www.loc.gov/standards/premis/premis.xsd">
</premis:premis>
152 changes: 152 additions & 0 deletions internal/activities/combine_premis.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package activities

import (
"context"
"errors"
"os"
"path"
"strings"

"github.com/beevik/etree"
"github.com/otiai10/copy"
)

// CombinePREMISName is the name under which the combine-PREMIS activity is
// registered with the worker and by which the preprocessing workflow invokes it.
const CombinePREMISName = "combine-premis"

// CombinePREMISActivity merges a transfer's individual PREMIS XML files into a
// single metadata/premis.xml file. It has no fields and therefore carries no
// configuration or state of its own.
type CombinePREMISActivity struct{}

// NewCombinePREMISActivity returns a pointer to a freshly allocated
// CombinePREMISActivity.
func NewCombinePREMISActivity() *CombinePREMISActivity {
	return new(CombinePREMISActivity)
}

// CombinePREMISParams is the input to CombinePREMISActivity.Execute.
type CombinePREMISParams struct {
// Path is the transfer directory. Its immediate subdirectories are scanned
// for "*_premis.xml" files and the combined document is written to
// "metadata/premis.xml" beneath it.
Path string
}

// CombinePREMISResult is the output of CombinePREMISActivity.Execute.
type CombinePREMISResult struct {
// Out is set to "OK" by Execute when the activity completes without error.
Out string
}

// Execute builds "metadata/premis.xml" under params.Path by copying the empty
// PREMIS template there and then merging into it, one file at a time, every
// PREMIS file reported by CombinePREMISGetPaths for the transfer.
//
// On success it returns a result whose Out field is "OK". The first error from
// listing, copying the template, or merging a file is returned unchanged and
// stops processing.
func (md *CombinePREMISActivity) Execute(
	ctx context.Context,
	params *CombinePREMISParams,
) (*CombinePREMISResult, error) {
	// Discover the transfer's PREMIS files before anything is written.
	premisPaths, err := CombinePREMISGetPaths(params.Path)
	if err != nil {
		return nil, err
	}

	// Seed the combined file from the empty template.
	// NOTE(review): "empty_premis.xml" is a relative path, so it is resolved
	// against the worker's current working directory — confirm deployment
	// guarantees the template is there.
	combinedPath := path.Join(params.Path, "metadata/premis.xml")
	if err := copy.Copy("empty_premis.xml", combinedPath); err != nil {
		return nil, err
	}

	// Append each source file's elements to the combined document.
	for i := range premisPaths {
		if err := CombinePREMISCopy(premisPaths[i], combinedPath); err != nil {
			return nil, err
		}
	}

	return &CombinePREMISResult{Out: "OK"}, nil
}

// CombinePREMISGetPaths lists the PREMIS files of a transfer.
//
// It looks exactly one level down: for every immediate subdirectory of
// transfer_dir it collects the non-directory entries whose name ends in
// "_premis.xml" (compared case-insensitively). Files directly inside
// transfer_dir and anything nested deeper are not considered.
//
// The returned slice is never nil on success; it is empty when nothing
// matches. Any directory-read error is returned as is.
func CombinePREMISGetPaths(transfer_dir string) ([]string, error) {
	topEntries, err := os.ReadDir(transfer_dir)
	if err != nil {
		return nil, err
	}

	found := []string{}
	for _, top := range topEntries {
		if !top.IsDir() {
			continue
		}

		dir := path.Join(transfer_dir, top.Name())
		entries, err := os.ReadDir(dir)
		if err != nil {
			return nil, err
		}

		for _, entry := range entries {
			name := entry.Name()
			if entry.IsDir() || !strings.HasSuffix(strings.ToLower(name), "_premis.xml") {
				continue
			}
			found = append(found, path.Join(dir, name))
		}
	}

	return found, nil
}

// CombinePREMISCopy merges one PREMIS file into another.
//
// It reads the XML document at source_filepath, takes the "object", "event"
// and "agent" elements that are direct children of its root "premis" element,
// and adds them (in that order) under the root "premis" element of the
// document at destination_filepath, which is then re-indented and written
// back to the same path.
//
// Before being added, every object's direct "originalName" child has "data/"
// prepended to its text.
//
// An error is returned when either file cannot be read, when either document
// has no root "premis" element, or when the destination cannot be written.
func CombinePREMISCopy(source_filepath string, destination_filepath string) error {
	// Load the source document and locate its root element.
	srcDoc := etree.NewDocument()
	if err := srcDoc.ReadFromFile(source_filepath); err != nil {
		return err
	}
	srcRoot := srcDoc.FindElement("/premis")
	if srcRoot == nil {
		return errors.New("no root premis element found in source document")
	}

	// Collect the direct children to carry over, grouped by tag.
	objects := srcRoot.FindElements("object")
	events := srcRoot.FindElements("event")
	agents := srcRoot.FindElements("agent")

	// Load the destination document and locate its root element.
	dstDoc := etree.NewDocument()
	if err := dstDoc.ReadFromFile(destination_filepath); err != nil {
		return err
	}
	dstRoot := dstDoc.FindElement("/premis")
	if dstRoot == nil {
		return errors.New("no root premis element found in destination document")
	}

	// Prefix each object's original name with "data/".
	for _, object := range objects {
		if name := object.FindElement("originalName"); name != nil {
			name.SetText("data/" + name.Text())
		}
	}

	// Add objects first, then events, then agents.
	for _, group := range [][]*etree.Element{objects, events, agents} {
		CombinePREMISAddChildElements(dstRoot, group)
	}

	dstDoc.Indent(2)
	return dstDoc.WriteToFile(destination_filepath)
}

// CombinePREMISAddChildElements appends each element of new_child_elements to
// parent_element after setting the namespace prefix of that element, and of
// every element nested beneath it, to "premis".
//
// Only the subtree rooted at each new child is touched. The subtree is walked
// directly through ChildElements rather than with the path "//*": a path
// that begins with "/" is evaluated from the root of the document the element
// currently belongs to, which would re-prefix every element of that document
// once per child instead of just the child's descendants.
func CombinePREMISAddChildElements(parent_element *etree.Element, new_child_elements []*etree.Element) {
	for _, child := range new_child_elements {
		// Iterative depth-first walk over the child and all its descendants.
		pending := []*etree.Element{child}
		for len(pending) > 0 {
			last := len(pending) - 1
			el := pending[last]
			pending = pending[:last]

			el.Space = "premis"
			pending = append(pending, el.ChildElements()...)
		}

		parent_element.AddChild(child)
	}
}
11 changes: 11 additions & 0 deletions internal/workflow/preprocessing.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ func (w *PreprocessingWorkflow) Execute(
return nil, e
}

// Combine PREMIS files into one.
var combinePREMIS activities.CombinePREMISResult
e = temporalsdk_workflow.ExecuteActivity(
withLocalActOpts(ctx),
activities.CombinePREMISName,
&activities.CombinePREMISParams{Path: localPath},
).Get(ctx, &combinePREMIS)
if e != nil {
return nil, e
}

// Remove PREMIS XML files.
var removeFilesResult removefiles.ActivityResult
e = temporalsdk_workflow.ExecuteActivity(
Expand Down
11 changes: 11 additions & 0 deletions internal/workflow/preprocessing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ func (s *PreprocessingTestSuite) SetupTest(cfg config.Configuration) {
activities.NewTransformVecteurAIPActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.TransformVecteurAIPName},
)
s.env.RegisterActivityWithOptions(
activities.NewCombinePREMISActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName},
)
s.env.RegisterActivityWithOptions(
removefiles.NewActivity(removefiles.Config{}).Execute,
temporalsdk_activity.RegisterOptions{Name: removefiles.ActivityName},
Expand Down Expand Up @@ -174,6 +178,13 @@ func (s *PreprocessingTestSuite) TestVecteurAIP() {
).Return(
&activities.TransformVecteurAIPResult{}, nil,
)
s.env.OnActivity(
activities.CombinePREMISName,
sessionCtx,
&activities.CombinePREMISParams{Path: sipPath},
).Return(
&activities.CombinePREMISResult{}, nil,
)
s.env.OnActivity(
removefiles.ActivityName,
sessionCtx,
Expand Down

0 comments on commit a1ae576

Please sign in to comment.