Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine PREMIS files from Vecteur AIPs #6

Merged
merged 2 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ func (m *Main) Run(ctx context.Context) error {
activities.NewMetadataValidationActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.MetadataValidationName},
)
w.RegisterActivityWithOptions(
activities.NewCombinePREMISActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName},
)
w.RegisterActivityWithOptions(
activities.NewSipCreationActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.SipCreationName},
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.22.3

require (
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9
github.com/beevik/etree v1.4.0
github.com/go-logr/logr v1.4.1
github.com/nyudlts/go-bagit v0.3.0-alpha
github.com/otiai10/copy v1.14.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 h1:JhybsG9MteMDlkXH/dHsENUon/8l9KY7VUc6F4XOOts=
github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9/go.mod h1:uf0jIGyZGHi3oTfhg+QkwCkyTaGhtAwromhwouC1FhU=
github.com/beevik/etree v1.4.0 h1:oz1UedHRepuY3p4N5OjE0nK1WLCqtzHf25bxplKOHLs=
github.com/beevik/etree v1.4.0/go.mod h1:cyWiXwGoasx60gHvtnEh5x8+uIjUVnjWqBvEnhnqKDA=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
Expand Down
5 changes: 5 additions & 0 deletions hack/sampledata/xsd/empty_premis.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<premis:premis version="3.0"
xmlns:premis="http://www.loc.gov/premis/v3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/premis/v3 http://www.loc.gov/standards/premis/premis.xsd">
</premis:premis>
155 changes: 155 additions & 0 deletions internal/activities/combine_premis.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package activities

import (
"context"
"errors"
"os"
"path"
"path/filepath"
"strings"

"github.com/beevik/etree"
"github.com/otiai10/copy"
)

// CombinePREMISName is the name under which the combine PREMIS activity is
// registered with, and invoked through, the workflow engine.
const CombinePREMISName = "combine-premis"

// CombinePREMISActivity merges the PREMIS files found in a transfer into a
// single combined PREMIS file. It carries no state of its own.
type CombinePREMISActivity struct{}

// NewCombinePREMISActivity returns a ready-to-use CombinePREMISActivity.
func NewCombinePREMISActivity() *CombinePREMISActivity {
	return new(CombinePREMISActivity)
}

// CombinePREMISParams holds the input of CombinePREMISActivity.Execute.
type CombinePREMISParams struct {
	// Path is the transfer directory. Its "objects" subdirectory is scanned
	// for PREMIS files and the combined file is written beneath it at
	// "metadata/premis.xml".
	Path string
}

// CombinePREMISResult is the output of CombinePREMISActivity.Execute.
type CombinePREMISResult struct {
	// Out is a status string; Execute sets it to "OK" on success.
	Out string
}

// Execute builds one combined PREMIS file for the transfer at params.Path.
//
// It gathers the transfer's PREMIS file paths, copies the empty PREMIS
// template to "metadata/premis.xml" under params.Path, and then merges each
// gathered file into that combined file in turn. The first error encountered
// is returned unchanged together with a nil result; on success the result's
// Out field is "OK". ctx is currently unused.
func (md *CombinePREMISActivity) Execute(
	ctx context.Context,
	params *CombinePREMISParams,
) (*CombinePREMISResult, error) {
	// Collect the source PREMIS files before anything is written.
	premisPaths, err := CombinePREMISGetPaths(params.Path)
	if err != nil {
		return nil, err
	}

	// Seed the combined file from the empty template. The template path is
	// relative, so it is resolved against the process's working directory.
	combinedPath := path.Join(params.Path, "metadata/premis.xml")
	if err := copy.Copy("empty_premis.xml", combinedPath); err != nil {
		return nil, err
	}

	// Append the elements of every source PREMIS file to the combined file.
	for i := range premisPaths {
		if err := CombinePREMISCopy(premisPaths[i], combinedPath); err != nil {
			return nil, err
		}
	}

	return &CombinePREMISResult{Out: "OK"}, nil
}

// CombinePREMISGetPaths returns the paths of the PREMIS files of a transfer.
//
// It looks exactly one level below "<transfer_dir>/objects": for every
// subdirectory of "objects", each regular entry whose name ends in
// "_premis.xml" (compared case-insensitively) is reported. Files directly
// inside "objects" and files in deeper subdirectories are not considered.
//
// The returned slice is never nil when err is nil; it is empty when no PREMIS
// file is found. An error from reading "objects" or any of its subdirectories
// is returned as is.
func CombinePREMISGetPaths(transfer_dir string) ([]string, error) {
	objectsDir := filepath.Join(transfer_dir, "objects")
	entries, err := os.ReadDir(objectsDir)
	if err != nil {
		return nil, err
	}

	// Start from a non-nil slice so callers always get a usable, possibly
	// empty, list.
	premisPaths := []string{}
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}

		// These are OS filesystem paths, so join them with filepath (not
		// path) to keep separators consistent on every platform.
		subDir := filepath.Join(objectsDir, entry.Name())
		subEntries, err := os.ReadDir(subDir)
		if err != nil {
			return nil, err
		}

		for _, subEntry := range subEntries {
			if subEntry.IsDir() {
				continue
			}
			if strings.HasSuffix(strings.ToLower(subEntry.Name()), "_premis.xml") {
				premisPaths = append(premisPaths, filepath.Join(subDir, subEntry.Name()))
			}
		}
	}

	return premisPaths, nil
}

// CombinePREMISCopy merges the PREMIS file at source_filepath into the PREMIS
// file at destination_filepath.
//
// The "object", "event" and "agent" children of the source root "premis"
// element are appended, in that order, to the destination root "premis"
// element. Before they are appended, the text of each object's "originalName"
// child is prefixed with "objects/<name of the source file's directory>/".
// The destination document is then re-indented and written back in place.
//
// An error is returned when either file cannot be read or lacks a root
// "premis" element, or when the destination cannot be written.
func CombinePREMISCopy(source_filepath, destination_filepath string) error {
	// Load the source document and locate its root element.
	srcDoc := etree.NewDocument()
	if err := srcDoc.ReadFromFile(source_filepath); err != nil {
		return err
	}
	srcRoot := srcDoc.FindElement("/premis")
	if srcRoot == nil {
		return errors.New("no root premis element found in source document")
	}

	// Collect the elements to transfer, grouped by kind.
	objects := srcRoot.FindElements("object")
	events := srcRoot.FindElements("event")
	agents := srcRoot.FindElements("agent")

	// Load the destination document and locate its root element.
	dstDoc := etree.NewDocument()
	if err := dstDoc.ReadFromFile(destination_filepath); err != nil {
		return err
	}
	dstRoot := dstDoc.FindElement("/premis")
	if dstRoot == nil {
		return errors.New("no root premis element found in destination document")
	}

	// Rewrite each object's original name so that it includes the "objects"
	// directory and the directory that held the source PREMIS file.
	namePrefix := "objects/" + filepath.Base(filepath.Dir(source_filepath)) + "/"
	for _, object := range objects {
		if originalName := object.FindElement("originalName"); originalName != nil {
			originalName.SetText(namePrefix + originalName.Text())
		}
	}

	// Append objects first, then events, then agents.
	for _, group := range [][]*etree.Element{objects, events, agents} {
		CombinePREMISAddChildElements(dstRoot, group)
	}

	dstDoc.Indent(2)
	return dstDoc.WriteToFile(destination_filepath)
}

// CombinePREMISAddChildElements appends each element of new_child_elements to
// parent_element, in order, after setting the namespace prefix of the element
// and of all of its descendants to "premis".
//
// Per etree's documentation, AddChild first detaches a token from its current
// parent, so the elements are moved out of whatever tree they were in rather
// than copied.
func CombinePREMISAddChildElements(parent_element *etree.Element, new_child_elements []*etree.Element) {
	for _, child := range new_child_elements {
		child.Space = "premis"

		// Use a path relative to child (".//*"). A path with a leading "/"
		// is resolved from the top of the tree child belongs to, which would
		// rescan and relabel that whole tree for every child instead of just
		// this child's descendants.
		for _, descendant := range child.FindElements(".//*") {
			descendant.Space = "premis"
		}

		parent_element.AddChild(child)
	}
}
8 changes: 7 additions & 1 deletion internal/activities/transform_vecteur_aip.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,19 @@ func (a *TransformVecteurAIPActivity) Execute(
return nil, err
}

// Create objects directory.
objectsPath := filepath.Join(params.Path, "objects")
if err = os.Mkdir(objectsPath, 0o750); err != nil {
return nil, err
}

// Move all entries from content/content to root folder.
entries, err := os.ReadDir(contentPath)
if err != nil {
return nil, err
}
for _, entry := range entries {
err := fsutil.Move(filepath.Join(contentPath, entry.Name()), filepath.Join(params.Path, entry.Name()))
err := fsutil.Move(filepath.Join(contentPath, entry.Name()), filepath.Join(objectsPath, entry.Name()))
if err != nil {
return nil, err
}
Expand Down
11 changes: 11 additions & 0 deletions internal/workflow/preprocessing.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ func (w *PreprocessingWorkflow) Execute(
return nil, e
}

// Combine PREMIS files into one.
var combinePREMIS activities.CombinePREMISResult
e = temporalsdk_workflow.ExecuteActivity(
withLocalActOpts(ctx),
activities.CombinePREMISName,
&activities.CombinePREMISParams{Path: localPath},
).Get(ctx, &combinePREMIS)
if e != nil {
return nil, e
}

// Remove PREMIS XML files.
var removeFilesResult removefiles.ActivityResult
e = temporalsdk_workflow.ExecuteActivity(
Expand Down
11 changes: 11 additions & 0 deletions internal/workflow/preprocessing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ func (s *PreprocessingTestSuite) SetupTest(cfg config.Configuration) {
activities.NewTransformVecteurAIPActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.TransformVecteurAIPName},
)
s.env.RegisterActivityWithOptions(
activities.NewCombinePREMISActivity().Execute,
temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName},
)
s.env.RegisterActivityWithOptions(
removefiles.NewActivity(removefiles.Config{}).Execute,
temporalsdk_activity.RegisterOptions{Name: removefiles.ActivityName},
Expand Down Expand Up @@ -174,6 +178,13 @@ func (s *PreprocessingTestSuite) TestVecteurAIP() {
).Return(
&activities.TransformVecteurAIPResult{}, nil,
)
s.env.OnActivity(
activities.CombinePREMISName,
sessionCtx,
&activities.CombinePREMISParams{Path: sipPath},
).Return(
&activities.CombinePREMISResult{}, nil,
)
s.env.OnActivity(
removefiles.ActivityName,
sessionCtx,
Expand Down
Loading