OCR-D · kba · Oct 21, 2022 · May 13, 2022 · May 13, 2022 · May 13, 2022
diff --git a/ocrd_models/ocrd_models/ocrd_agent.py b/ocrd_models/ocrd_models/ocrd_agent.py
@@ -149,11 +149,11 @@ def notes(self, notes):
         """
         Set the ``mets:note`` element values.
         """
-        el_notes = self._el.findall(TAG_METS_NOTE)
-        if el_notes:
-            for el_note in el_notes:
-                self._el.remove(el_note)
-        if notes:
+        if notes is not None:
+            el_notes = self._el.findall(TAG_METS_NOTE)
+            if el_notes:
+                for el_note in el_notes:
+                    self._el.remove(el_note)
             for note in notes:
                 el_note = ET.SubElement(self._el, TAG_METS_NOTE, nsmap={'ocrd': NS['ocrd']})
                 attrib, text = note

diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py
@@ -62,25 +62,29 @@ def make_file_id(ocrd_file, output_file_grp):
     Derive a new file ID for an output file from an existing input file ``ocrd_file``
     and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
     If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
-    Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append ``output_file_grp``.
-    Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
-    (as a fallback counter), and increment counter until there is no more ID conflict.
-    """
+    Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append the ID to ``output_file_grp``.
+    Else if ``ocrd_file`` has a pageId at all, then merely append it to ``output_file_grp``.
+    Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp.
+    """
+    # considerations for this behaviour:
+    # - uniqueness (in spite of different METS and processor conventions)
+    # - predictability (i.e. output name can be anticipated from the input name)
+    # - stability (i.e. output at least as much sorted and consistent as the input)
+    # ... and all this in spite of --page-id selection and --overwrite
     ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
     if ret == ocrd_file.ID:
         if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID:
             # still sufficiently unique
             ret = output_file_grp + '_' + ocrd_file.ID
+        elif ocrd_file.pageId:
+            ret = output_file_grp + '_' + ocrd_file.pageId
         else:
             ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
             try:
                 n = ids.index(ocrd_file.ID) + 1
             except ValueError:
                 n = len(ids)
             ret = concat_padded(output_file_grp, n)
-            while next(ocrd_file.mets.find_files(ID=ret), None):
-                n += 1
-                ret = concat_padded(output_file_grp, n)
     if not REGEX_FILE_ID.fullmatch(ret):
         ret = ret.replace(':', '_')
         ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)

diff --git a/tests/data/__init__.py b/tests/data/__init__.py
@@ -1,5 +1,7 @@
 import json
+import os
 from ocrd import Processor
+from ocrd_utils import make_file_id
 
 DUMMY_TOOL = {
     'executable': 'ocrd-test',
@@ -37,6 +39,24 @@ def __init__(self, *args, **kwargs):
         }
         super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)
 
+class DummyProcessorWithOutput(Processor):  # test helper: adds one output file per input file
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = DUMMY_TOOL
+        kwargs['version'] = '0.0.1'
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        for input_file in self.input_files:
+            file_id = make_file_id(input_file, self.output_file_grp)  # output ID derived from the input file
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=input_file.pageId,
+                mimetype=input_file.mimetype,
+                local_filename=os.path.join(self.output_file_grp, file_id),
+                content='CONTENT')  # fixed placeholder payload
+
 class IncompleteProcessor(Processor):
     pass
 

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
@@ -3,7 +3,7 @@
 from tempfile import TemporaryDirectory
 from os.path import join
 from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
-from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
+from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor, DUMMY_TOOL
 
 from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
 from ocrd.resolver import Resolver
@@ -80,6 +80,32 @@ def test_run_agent(self):
         self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent')
         #  print(self.workspace.mets.agents[no_agents_before])
 
+    def test_run_input(self):  # the last agent's notes must name the input fileGrp
+        run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace, 
+                      input_file_grp="OCR-D-IMG")
+        self.assertTrue(len(self.workspace.mets.agents) > 0)
+        self.assertTrue(len(self.workspace.mets.agents[-1].notes) > 0)
+        self.assertTrue(("input-file-grp", "OCR-D-IMG") in self.workspace.mets.agents[-1].notes)
+
+    def test_run_output(self):  # expects 3 files in fileGrp OCR-D-OUT after the run
+        run_processor(DummyProcessorWithOutput, ocrd_tool=DUMMY_TOOL, workspace=self.workspace, 
+                      input_file_grp="OCR-D-IMG",
+                      output_file_grp="OCR-D-OUT")
+        self.assertEqual(len(self.workspace.mets.find_all_files(fileGrp="OCR-D-OUT")), 3)
+
+    def test_run_output_exists(self):  # writing into OCR-D-SEG-PAGE must fail with a duplicate-ID error
+        with self.assertRaisesRegex(Exception, "File with ID='.*' already exists"):  # presumably the fixture already holds these IDs - confirm
+            run_processor(DummyProcessorWithOutput, ocrd_tool=DUMMY_TOOL, workspace=self.workspace, 
+                          input_file_grp="OCR-D-IMG",
+                          output_file_grp="OCR-D-SEG-PAGE")
+
+    def test_run_output_overwrite(self):  # same run as test_run_output_exists, but with overwrite enabled
+        self.workspace.overwrite_mode = True  # set before running the processor
+        run_processor(DummyProcessorWithOutput, ocrd_tool=DUMMY_TOOL, workspace=self.workspace, 
+                      input_file_grp="OCR-D-IMG",
+                      output_file_grp="OCR-D-SEG-PAGE")
+        self.assertEqual(len(self.workspace.mets.find_all_files(fileGrp="OCR-D-SEG-PAGE")), 3)
+
     def test_run_cli(self):
         with TemporaryDirectory() as tempdir:
             run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)

diff --git a/tests/test_workspace.py b/tests/test_workspace.py
@@ -79,6 +79,45 @@ def test_workspace_add_file(plain_workspace):
     assert exists(fpath)
 
 
+def test_workspace_add_file_overwrite(plain_workspace):  # re-adding an existing ID fails unless force=True
+    fpath = str(plain_workspace.directory / 'ID1.tif')
+
+    # act: add ID1, expect an error when adding it again, then add it once more with force=True
+    plain_workspace.add_file(
+        'GRP',
+        ID='ID1',
+        mimetype='image/tiff',
+        content='CONTENT',
+        pageId='phys1',
+        local_filename=fpath)
+    with pytest.raises(Exception) as fn_exc:
+        plain_workspace.add_file(
+            'GRP',
+            ID='ID1',
+            mimetype='image/tiff',
+            content='CONTENT',
+            pageId='phys2',
+            local_filename=fpath)    
+    assert str(fn_exc.value) == "File with ID='ID1' already exists"
+    plain_workspace.add_file(
+        'GRP',
+        ID='ID1',
+        mimetype='image/tiff',
+        content='CONTENT',
+        pageId='phys2',
+        local_filename=fpath,
+        force=True)
+    f = plain_workspace.mets.find_all_files()[0]
+
+    # assert: the first listed file carries the values of the forced re-add (pageId 'phys2')
+    assert f.ID == 'ID1'
+    assert f.mimetype == 'image/tiff'
+    assert f.url == fpath
+    assert f.local_filename == fpath
+    assert f.pageId == 'phys2'
+    assert exists(fpath)
+
+
 def test_workspace_add_file_basename_no_content(plain_workspace):
     plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', pageId=None)
     f = next(plain_workspace.mets.find_files())