diff --git a/cmd/worker/workercmd/cmd.go b/cmd/worker/workercmd/cmd.go index 80b0ce19..9dd49497 100644 --- a/cmd/worker/workercmd/cmd.go +++ b/cmd/worker/workercmd/cmd.go @@ -71,6 +71,10 @@ func (m *Main) Run(ctx context.Context) error { activities.NewMetadataValidationActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.MetadataValidationName}, ) + w.RegisterActivityWithOptions( + activities.NewCombinePREMISActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName}, + ) w.RegisterActivityWithOptions( activities.NewSipCreationActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.SipCreationName}, diff --git a/go.mod b/go.mod index 9a666546..724b1acf 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.22.3 require ( github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 + github.com/beevik/etree v1.4.0 github.com/go-logr/logr v1.4.1 github.com/nyudlts/go-bagit v0.3.0-alpha github.com/otiai10/copy v1.14.0 diff --git a/go.sum b/go.sum index 3c054693..15cbadc6 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 h1:JhybsG9MteMDlkXH/dHsENUon/8l9KY7VUc6F4XOOts= github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9/go.mod h1:uf0jIGyZGHi3oTfhg+QkwCkyTaGhtAwromhwouC1FhU= +github.com/beevik/etree v1.4.0 h1:oz1UedHRepuY3p4N5OjE0nK1WLCqtzHf25bxplKOHLs= +github.com/beevik/etree v1.4.0/go.mod h1:cyWiXwGoasx60gHvtnEh5x8+uIjUVnjWqBvEnhnqKDA= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= diff --git a/hack/sampledata/xsd/empty_premis.xml b/hack/sampledata/xsd/empty_premis.xml new file mode 100644 index 00000000..a562f308 --- /dev/null +++ b/hack/sampledata/xsd/empty_premis.xml @@ -0,0 +1,5 @@ + + + diff --git a/internal/activities/combine_premis.go b/internal/activities/combine_premis.go new file mode 100644 index 00000000..33f8018d --- /dev/null +++ b/internal/activities/combine_premis.go @@ -0,0 +1,138 @@ +package activities + +import ( + etree "github.com/beevik/etree" + "context" + "errors" + "os" + "path" + "strings" +) + +const CombinePREMISName = "combine-premis" + +type CombinePREMISActivity struct{} + +func NewCombinePREMISActivity() *CombinePREMISActivity { + return &CombinePREMISActivity{} +} + +type CombinePREMISParams struct { + Path string +} + +type CombinePREMISResult struct { + Out string +} + +func (md *CombinePREMISActivity) Execute( + ctx context.Context, + params *CombinePREMISParams, +) (*CombinePREMISResult, error) { + // Get transfer's PREMIS file paths + file_paths, err := CombinePREMISGetPaths(params.Path) + if err != nil { + return nil, err + } + + // Write elements from transfer's PREMIS files to combined PREMIS file + for _, file_path := range file_paths { + err := CombinePREMISCopy(file_path, "metadata/premis.xml") + if err != nil { + return nil, err + } + } + + res := &CombinePREMISResult{} + res.Out = "OK" + return res, nil +} + +func CombinePREMISGetPaths(transfer_dir string) ([]string, error) { + content_subdir := "content" + file_dir := path.Join(transfer_dir, content_subdir) + + dir_items, err := os.ReadDir(file_dir) + if err != nil { + return nil, err + } + + file_paths := []string{} + for _, dir_item := range dir_items { + if dir_item.IsDir() { + subdir := path.Join(transfer_dir, content_subdir, dir_item.Name()) + + sub_items, err := os.ReadDir(subdir) + if err != nil { + return nil, err + } + + for _, subdir_item := range sub_items { + if !subdir_item.IsDir() { + if strings.HasSuffix(strings.ToLower(subdir_item.Name()), "_premis.xml") { + file_paths = append(file_paths, path.Join(subdir, subdir_item.Name())) + } + } + } + } + } + + return file_paths, nil +} + +func CombinePREMISCopy(source_filepath string, destination_filepath string) error { + // Parse source document and get root PREMIS element + source_doc := etree.NewDocument() + + if err := source_doc.ReadFromFile(source_filepath); err != nil { + return err + } + + source_premis_element := source_doc.FindElement("/premis") + if source_premis_element == nil { + return errors.New("no root premis element found in source document") + } + + // Read source child PREMIS elements + source_premis_object_elements := source_premis_element.FindElements("object") + source_premis_event_elements := source_premis_element.FindElements("event") + source_premis_agent_elements := source_premis_element.FindElements("agent") + + // Parse destination document and get root PREMIS element + dest_doc := etree.NewDocument() + if err := dest_doc.ReadFromFile(destination_filepath); err != nil { + return err + } + + dest_premis_element := dest_doc.FindElement("/premis") + if dest_premis_element == nil { + return errors.New("no root premis element found in destination document") + } + + // Update PREMIS originalname child elements of PREMIS object elements + for _, premis_object_element := range source_premis_object_elements { + objectname_element := premis_object_element.FindElement("originalName") + if objectname_element != nil { + objectname_element.SetText("data/" + objectname_element.Text()) + } + } + + // Write destination child PREMIS elements + CombinePREMISAddChildElements(dest_premis_element, source_premis_object_elements) + CombinePREMISAddChildElements(dest_premis_element, source_premis_event_elements) + CombinePREMISAddChildElements(dest_premis_element, source_premis_agent_elements) + + dest_doc.Indent(2) + err := dest_doc.WriteToFile(destination_filepath) + if err != nil { + return err + } + + return nil +} + +func CombinePREMISAddChildElements(parent_element *etree.Element, new_child_elements []*etree.Element) { + for _, child_element := range new_child_elements { + parent_element.AddChild(child_element) + } +} diff --git a/internal/workflow/preprocessing.go b/internal/workflow/preprocessing.go index 8ac89e28..e529f9d0 100644 --- a/internal/workflow/preprocessing.go +++ b/internal/workflow/preprocessing.go @@ -132,6 +132,17 @@ func (w *PreprocessingWorkflow) Execute( return nil, e } + // Combine PREMIS files into one. + var combinePREMIS activities.CombinePREMISResult + e = temporalsdk_workflow.ExecuteActivity( + withLocalActOpts(ctx), + activities.CombinePREMISName, + &activities.CombinePREMISParams{Path: localPath}, + ).Get(ctx, &combinePREMIS) + if e != nil { + return nil, e + } + // Remove PREMIS XML files. var removeFilesResult removefiles.ActivityResult e = temporalsdk_workflow.ExecuteActivity( diff --git a/internal/workflow/preprocessing_test.go b/internal/workflow/preprocessing_test.go index d42c81ba..edfba84b 100644 --- a/internal/workflow/preprocessing_test.go +++ b/internal/workflow/preprocessing_test.go @@ -52,6 +52,10 @@ func (s *PreprocessingTestSuite) SetupTest(cfg config.Configuration) { activities.NewTransformVecteurAIPActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.TransformVecteurAIPName}, ) + s.env.RegisterActivityWithOptions( + activities.NewCombinePREMISActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName}, + ) s.env.RegisterActivityWithOptions( removefiles.NewActivity(removefiles.Config{}).Execute, temporalsdk_activity.RegisterOptions{Name: removefiles.ActivityName}, @@ -174,6 +178,13 @@ func (s *PreprocessingTestSuite) TestVecteurAIP() { ).Return( &activities.TransformVecteurAIPResult{}, nil, ) + s.env.OnActivity( + activities.CombinePREMISName, + sessionCtx, + &activities.CombinePREMISParams{Path: sipPath}, + ).Return( + &activities.CombinePREMISResult{}, nil, + ) s.env.OnActivity( removefiles.ActivityName, sessionCtx,