From a1ae576b074b672fe003f5bf18f155d20d7b85e7 Mon Sep 17 00:00:00 2001 From: Mike Cantelon Date: Fri, 17 May 2024 14:23:38 +0200 Subject: [PATCH 1/2] Combine PREMIS files from Vecteur AIPs --- cmd/worker/workercmd/cmd.go | 4 + go.mod | 1 + go.sum | 2 + hack/sampledata/xsd/empty_premis.xml | 5 + internal/activities/combine_premis.go | 152 ++++++++++++++++++++++++ internal/workflow/preprocessing.go | 11 ++ internal/workflow/preprocessing_test.go | 11 ++ 7 files changed, 186 insertions(+) create mode 100644 hack/sampledata/xsd/empty_premis.xml create mode 100644 internal/activities/combine_premis.go diff --git a/cmd/worker/workercmd/cmd.go b/cmd/worker/workercmd/cmd.go index 80b0ce19..9dd49497 100644 --- a/cmd/worker/workercmd/cmd.go +++ b/cmd/worker/workercmd/cmd.go @@ -71,6 +71,10 @@ func (m *Main) Run(ctx context.Context) error { activities.NewMetadataValidationActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.MetadataValidationName}, ) + w.RegisterActivityWithOptions( + activities.NewCombinePREMISActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName}, + ) w.RegisterActivityWithOptions( activities.NewSipCreationActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.SipCreationName}, diff --git a/go.mod b/go.mod index 9a666546..724b1acf 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.22.3 require ( github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 + github.com/beevik/etree v1.4.0 github.com/go-logr/logr v1.4.1 github.com/nyudlts/go-bagit v0.3.0-alpha github.com/otiai10/copy v1.14.0 diff --git a/go.sum b/go.sum index 3c054693..15cbadc6 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9 h1:JhybsG9MteMDlkXH/dHsENUon/8l9KY7VUc6F4XOOts= github.com/artefactual-sdps/temporal-activities v0.0.0-20240513093038-77e9f8382ca9/go.mod h1:uf0jIGyZGHi3oTfhg+QkwCkyTaGhtAwromhwouC1FhU= +github.com/beevik/etree v1.4.0 h1:oz1UedHRepuY3p4N5OjE0nK1WLCqtzHf25bxplKOHLs= +github.com/beevik/etree v1.4.0/go.mod h1:cyWiXwGoasx60gHvtnEh5x8+uIjUVnjWqBvEnhnqKDA= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= diff --git a/hack/sampledata/xsd/empty_premis.xml b/hack/sampledata/xsd/empty_premis.xml new file mode 100644 index 00000000..373b780f --- /dev/null +++ b/hack/sampledata/xsd/empty_premis.xml @@ -0,0 +1,5 @@ + + + diff --git a/internal/activities/combine_premis.go b/internal/activities/combine_premis.go new file mode 100644 index 00000000..830793a9 --- /dev/null +++ b/internal/activities/combine_premis.go @@ -0,0 +1,152 @@ +package activities + +import ( + "context" + "errors" + "os" + "path" + "strings" + + "github.com/beevik/etree" + "github.com/otiai10/copy" +) + +const CombinePREMISName = "combine-premis" + +type CombinePREMISActivity struct{} + +func NewCombinePREMISActivity() *CombinePREMISActivity { + return &CombinePREMISActivity{} +} + +type CombinePREMISParams struct { + Path string +} + +type CombinePREMISResult struct { + Out string +} + +func (md *CombinePREMISActivity) Execute( + ctx context.Context, + params *CombinePREMISParams, +) (*CombinePREMISResult, error) { + // Get transfer's PREMIS file paths. + file_paths, err := CombinePREMISGetPaths(params.Path) + if err != nil { + return nil, err + } + + // Copy empty PREMIS file into metadata directory. + source_filepath := "empty_premis.xml" + dest_filepath := path.Join(params.Path, "metadata/premis.xml") + + err = copy.Copy(source_filepath, dest_filepath) + if err != nil { + return nil, err + } + + // Write elements from transfer's PREMIS files to combined PREMIS file. + combined_premis_filepath := path.Join(params.Path, "metadata/premis.xml") + + for _, file_path := range file_paths { + err := CombinePREMISCopy(file_path, combined_premis_filepath) + if err != nil { + return nil, err + } + } + + res := &CombinePREMISResult{} + res.Out = "OK" + return res, nil +} + +func CombinePREMISGetPaths(transfer_dir string) ([]string, error) { + dir_items, err := os.ReadDir(transfer_dir) + if err != nil { + return nil, err + } + + file_paths := []string{} + for _, dir_item := range dir_items { + if dir_item.IsDir() { + subdir := path.Join(transfer_dir, dir_item.Name()) + + sub_items, err := os.ReadDir(subdir) + if err != nil { + return nil, err + } + + for _, subdir_item := range sub_items { + if !subdir_item.IsDir() { + if strings.HasSuffix(strings.ToLower(subdir_item.Name()), "_premis.xml") { + file_paths = append(file_paths, path.Join(subdir, subdir_item.Name())) + } + } + } + } + } + + return file_paths, nil +} + +func CombinePREMISCopy(source_filepath string, destination_filepath string) error { + // Parse source document and get root PREMIS element. + source_doc := etree.NewDocument() + + if err := source_doc.ReadFromFile(source_filepath); err != nil { + return err + } + + source_premis_element := source_doc.FindElement("/premis") + if source_premis_element == nil { + return errors.New("no root premis element found in source document") + } + + // Read source child PREMIS elements. + source_premis_object_elements := source_premis_element.FindElements("object") + source_premis_event_elements := source_premis_element.FindElements("event") + source_premis_agent_elements := source_premis_element.FindElements("agent") + + // Parse destination document and get root PREMIS element. + dest_doc := etree.NewDocument() + if err := dest_doc.ReadFromFile(destination_filepath); err != nil { + return err + } + + dest_premis_element := dest_doc.FindElement("/premis") + if dest_premis_element == nil { + return errors.New("no root premis element found in destination document") + } + + // Update PREMIS originalname child elements of PREMIS object elements. + for _, premis_object_element := range source_premis_object_elements { + objectname_element := premis_object_element.FindElement("originalName") + if objectname_element != nil { + objectname_element.SetText("data/" + objectname_element.Text()) + } + } + + // Write destination child PREMIS elements. + CombinePREMISAddChildElements(dest_premis_element, source_premis_object_elements) + CombinePREMISAddChildElements(dest_premis_element, source_premis_event_elements) + CombinePREMISAddChildElements(dest_premis_element, source_premis_agent_elements) + + dest_doc.Indent(2) + err := dest_doc.WriteToFile(destination_filepath) + if err != nil { + return err + } + + return nil +} + +func CombinePREMISAddChildElements(parent_element *etree.Element, new_child_elements []*etree.Element) { + for _, child_element := range new_child_elements { + child_element.Space = "premis" + for _, element := range child_element.FindElements("//*") { + element.Space = "premis" + } + parent_element.AddChild(child_element) + } +} diff --git a/internal/workflow/preprocessing.go b/internal/workflow/preprocessing.go index 8ac89e28..e529f9d0 100644 --- a/internal/workflow/preprocessing.go +++ b/internal/workflow/preprocessing.go @@ -132,6 +132,17 @@ func (w *PreprocessingWorkflow) Execute( return nil, e } + // Combine PREMIS files into one. + var combinePREMIS activities.CombinePREMISResult + e = temporalsdk_workflow.ExecuteActivity( + withLocalActOpts(ctx), + activities.CombinePREMISName, + &activities.CombinePREMISParams{Path: localPath}, + ).Get(ctx, &combinePREMIS) + if e != nil { + return nil, e + } + // Remove PREMIS XML files. var removeFilesResult removefiles.ActivityResult e = temporalsdk_workflow.ExecuteActivity( diff --git a/internal/workflow/preprocessing_test.go b/internal/workflow/preprocessing_test.go index d42c81ba..edfba84b 100644 --- a/internal/workflow/preprocessing_test.go +++ b/internal/workflow/preprocessing_test.go @@ -52,6 +52,10 @@ func (s *PreprocessingTestSuite) SetupTest(cfg config.Configuration) { activities.NewTransformVecteurAIPActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.TransformVecteurAIPName}, ) + s.env.RegisterActivityWithOptions( + activities.NewCombinePREMISActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: activities.CombinePREMISName}, + ) s.env.RegisterActivityWithOptions( removefiles.NewActivity(removefiles.Config{}).Execute, temporalsdk_activity.RegisterOptions{Name: removefiles.ActivityName}, @@ -174,6 +178,13 @@ func (s *PreprocessingTestSuite) TestVecteurAIP() { ).Return( &activities.TransformVecteurAIPResult{}, nil, ) + s.env.OnActivity( + activities.CombinePREMISName, + sessionCtx, + &activities.CombinePREMISParams{Path: sipPath}, + ).Return( + &activities.CombinePREMISResult{}, nil, + ) s.env.OnActivity( removefiles.ActivityName, sessionCtx, From 0ce0188ec78e43ed916fdde668138ba7afbc46e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Raddaoui=20Mar=C3=ADn?= Date: Fri, 17 May 2024 14:32:11 +0200 Subject: [PATCH 2/2] Create objects dir and change paths in premis.xml [skip-codecov] --- internal/activities/combine_premis.go | 15 +++++++++------ internal/activities/transform_vecteur_aip.go | 8 +++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/internal/activities/combine_premis.go b/internal/activities/combine_premis.go index 830793a9..203292df 100644 --- a/internal/activities/combine_premis.go +++ b/internal/activities/combine_premis.go @@ -5,6 +5,7 @@ import ( "errors" "os" "path" + "path/filepath" "strings" "github.com/beevik/etree" @@ -37,7 +38,7 @@ func (md *CombinePREMISActivity) Execute( return nil, err } - // Copy empty PREMIS file into metadata directory. + // Copy empty PREMIS file into metadata directory. source_filepath := "empty_premis.xml" dest_filepath := path.Join(params.Path, "metadata/premis.xml") @@ -51,7 +52,7 @@ func (md *CombinePREMISActivity) Execute( for _, file_path := range file_paths { err := CombinePREMISCopy(file_path, combined_premis_filepath) - if err != nil { + if err != nil { return nil, err } } @@ -62,7 +63,8 @@ func (md *CombinePREMISActivity) Execute( } func CombinePREMISGetPaths(transfer_dir string) ([]string, error) { - dir_items, err := os.ReadDir(transfer_dir) + objects_dir := filepath.Join(transfer_dir, "objects") + dir_items, err := os.ReadDir(objects_dir) if err != nil { return nil, err } @@ -70,7 +72,7 @@ func CombinePREMISGetPaths(transfer_dir string) ([]string, error) { file_paths := []string{} for _, dir_item := range dir_items { if dir_item.IsDir() { - subdir := path.Join(transfer_dir, dir_item.Name()) + subdir := path.Join(objects_dir, dir_item.Name()) sub_items, err := os.ReadDir(subdir) if err != nil { @@ -90,7 +92,7 @@ func CombinePREMISGetPaths(transfer_dir string) ([]string, error) { return file_paths, nil } -func CombinePREMISCopy(source_filepath string, destination_filepath string) error { +func CombinePREMISCopy(source_filepath, destination_filepath string) error { // Parse source document and get root PREMIS element. source_doc := etree.NewDocument() @@ -120,10 +122,11 @@ func CombinePREMISCopy(source_filepath string, destination_filepath string) erro } // Update PREMIS originalname child elements of PREMIS object elements. + dirName := filepath.Base(filepath.Dir(source_filepath)) for _, premis_object_element := range source_premis_object_elements { objectname_element := premis_object_element.FindElement("originalName") if objectname_element != nil { - objectname_element.SetText("data/" + objectname_element.Text()) + objectname_element.SetText("objects/" + dirName + "/" + objectname_element.Text()) } } diff --git a/internal/activities/transform_vecteur_aip.go b/internal/activities/transform_vecteur_aip.go index 0afb847f..c44fee61 100644 --- a/internal/activities/transform_vecteur_aip.go +++ b/internal/activities/transform_vecteur_aip.go @@ -57,13 +57,19 @@ func (a *TransformVecteurAIPActivity) Execute( return nil, err } + // Create objects directory. + objectsPath := filepath.Join(params.Path, "objects") + if err = os.Mkdir(objectsPath, 0o750); err != nil { + return nil, err + } + // Move all entries from content/content to root folder. entries, err := os.ReadDir(contentPath) if err != nil { return nil, err } for _, entry := range entries { - err := fsutil.Move(filepath.Join(contentPath, entry.Name()), filepath.Join(params.Path, entry.Name())) + err := fsutil.Move(filepath.Join(contentPath, entry.Name()), filepath.Join(objectsPath, entry.Name())) if err != nil { return nil, err }