Skip to content

Commit

Permalink
Merge branch 'patch-11'
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Oct 21, 2022
2 parents 71d295a + 931e848 commit ea064f9
Show file tree
Hide file tree
Showing 11 changed files with 207 additions and 81 deletions.
1 change: 1 addition & 0 deletions ocrd/ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,7 @@ def save_image_file(self, image,
local_filename=file_path,
mimetype=mimetype,
content=image_bytes.getvalue(),
is_alternative_image=True,
force=force)
log.info('created file ID: %s, file_grp: %s, path: %s',
file_id, file_grp, out.local_filename)
Expand Down
10 changes: 5 additions & 5 deletions ocrd_models/ocrd_models/ocrd_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,11 @@ def notes(self, notes):
"""
Set the ``mets:note`` element values.
"""
el_notes = self._el.findall(TAG_METS_NOTE)
if el_notes:
for el_note in el_notes:
self._el.remove(el_note)
if notes:
if notes is not None:
el_notes = self._el.findall(TAG_METS_NOTE)
if el_notes:
for el_note in el_notes:
self._el.remove(el_note)
for note in notes:
el_note = ET.SubElement(self._el, TAG_METS_NOTE, nsmap={'ocrd': NS['ocrd']})
attrib, text = note
Expand Down
34 changes: 18 additions & 16 deletions ocrd_models/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,21 +310,21 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force
raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID)
if not REGEX_FILE_ID.fullmatch(fileGrp):
raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp)
el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS)
if el_fileGrp is None:
el_fileGrp = self.add_file_group(fileGrp)
mets_file = next(self.find_files(ID=ID), None)
if mets_file and not ignore:
if not force:
raise Exception("File with ID='%s' already exists" % ID)
mets_file.url = url
mets_file.mimetype = mimetype
mets_file.ID = ID
mets_file.pageId = pageId
mets_file.local_filename = local_filename
else:
kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v}
mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs)
log = getLogger('ocrd_models.ocrd_mets.add_file')
el_fileGrp = self.add_file_group(fileGrp)
if not ignore:
mets_file = next(self.find_files(ID=ID), None)
if mets_file:
if mets_file.fileGrp == fileGrp and \
mets_file.pageId == pageId and \
mets_file.mimetype == mimetype:
if not force:
raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
self.remove_file(ID=ID)
else:
raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v}
mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs)

return mets_file

Expand Down Expand Up @@ -485,13 +485,14 @@ def remove_physical_page_fptr(self, fileId):
mets_div.remove(mets_fptr)
return ret

def merge(self, other_mets, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
"""
Add all files from other_mets.
Accepts the same kwargs as :py:func:`find_files`
Keyword Args:
force (boolean): Whether to add_files with force (overwriting existing mets:file)
fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS
fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS
pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS
Expand All @@ -508,6 +509,7 @@ def merge(self, other_mets, fileGrp_mapping=None, fileId_mapping=None, pageId_ma
fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
mimetype=f_src.mimetype,
url=f_src.url,
force=force,
ID=fileId_mapping.get(f_src.ID, f_src.ID),
pageId=pageId_mapping.get(f_src.pageId, f_src.pageId))
# FIXME: merge metsHdr, amdSec, dmdSec as well
Expand Down
31 changes: 15 additions & 16 deletions ocrd_utils/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,25 +62,24 @@ def make_file_id(ocrd_file, output_file_grp):
Derive a new file ID for an output file from an existing input file ``ocrd_file``
and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append ``output_file_grp``.
Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
(as a fallback counter), and increment counter until there is no more ID conflict.
Else if ``ocrd_file`` has a ``pageId`` but it is not contained in the ``ocrd_file.ID``, concatenate ``output_file_grp`` and ``ocrd_file.pageId``.
Otherwise concatenate ``output_file_grp`` with the ``ocrd_file.ID``.
Note: ``make_file_id`` cannot guarantee that the ID is unique within an actual ``OcrdMets`` but will ensure uniqueness in a workflow with processors that consistently use ``make_file_id`` for ID generation.
Note: ``make_file_id`` generates page-specific IDs. For IDs referencing sub-page-level files or ``pc:AlternativeImage``, the output of ``make_file_id`` might need to be concatenated with a unique string, such as a growing index number.
"""
# considerations for this behaviour:
# - uniqueness (in spite of different METS and processor conventions)
# - predictability (i.e. output name can be anticipated from the input name)
# - stability (i.e. output at least as much sorted and consistent as the input)
# ... and all this in spite of --page-id selection and --overwrite
ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
if ret == ocrd_file.ID:
if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID:
# still sufficiently unique
ret = output_file_grp + '_' + ocrd_file.ID
if ret == ocrd_file.ID and output_file_grp != ocrd_file.fileGrp:
if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
ret = output_file_grp + '_' + ocrd_file.pageId
else:
ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
try:
n = ids.index(ocrd_file.ID) + 1
except ValueError:
n = len(ids)
ret = concat_padded(output_file_grp, n)
while next(ocrd_file.mets.find_files(ID=ret), None):
n += 1
ret = concat_padded(output_file_grp, n)
ret = output_file_grp + '_' + ocrd_file.ID
if not REGEX_FILE_ID.fullmatch(ret):
ret = ret.replace(':', '_')
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
Expand Down
24 changes: 19 additions & 5 deletions tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from os.path import dirname, realpath
from os import chdir
from contextlib import contextmanager
import sys
import logging
import io
import collections
from unittest import TestCase as VanillaTestCase, skip, main as unittests_main
import pytest
from ocrd_utils import disableLogging, initLogging
from ocrd_utils import disableLogging, initLogging, getLogger
from ocrd_models import OcrdMets

from tests.assets import assets, copy_of_directory
Expand Down Expand Up @@ -63,7 +64,6 @@ def create_ocrd_file(*args, **kwargs):
mets = OcrdMets.empty_mets()
return mets.add_file(*args, **kwargs)


# import traceback
# import warnings
# def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
Expand All @@ -72,10 +72,11 @@ def create_ocrd_file(*args, **kwargs):
# log.write(warnings.formatwarning(message, category, filename, lineno, line))
# warnings.showwarning = warn_with_traceback

# https://stackoverflow.com/questions/37944111/python-rolling-log-to-a-variable
# Adapted from http://alanwsmith.com/capturing-python-log-output-in-a-variable

class FIFOIO(io.TextIOBase):
"""
https://stackoverflow.com/questions/37944111/python-rolling-log-to-a-variable
Adapted from http://alanwsmith.com/capturing-python-log-output-in-a-variable
"""
def __init__(self, size, *args):
self.maxsize = size
io.TextIOBase.__init__(self, *args)
Expand All @@ -93,4 +94,17 @@ def shrink(self):
x = self.deque.popleft()
size -= len(x)

@contextmanager
def capture_log(loggerName):
disableLogging()
initLogging()
log_capture_string = FIFOIO(1024)
ch = logging.StreamHandler(log_capture_string)
getLogger(loggerName).addHandler(ch)
getLogger(loggerName).setLevel('DEBUG')
try:
yield log_capture_string
finally:
log_capture_string.close()

sys.path.append(dirname(realpath(__file__)) + '/../ocrd')
22 changes: 22 additions & 0 deletions tests/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
from ocrd import Processor
from ocrd_utils import make_file_id

DUMMY_TOOL = {
'executable': 'ocrd-test',
Expand Down Expand Up @@ -37,6 +39,26 @@ def __init__(self, *args, **kwargs):
}
super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)

class DummyProcessorWithOutput(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = DUMMY_TOOL
kwargs['version'] = '0.0.1'
super().__init__(*args, **kwargs)

def process(self):
# print([str(x) for x in self.input_files]
for input_file in self.input_files:
file_id = make_file_id(input_file, self.output_file_grp)
# print(input_file.ID, file_id)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=os.path.join(self.output_file_grp, file_id),
content='CONTENT')

class IncompleteProcessor(Processor):
pass

Expand Down
50 changes: 35 additions & 15 deletions tests/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
from datetime import datetime

from os.path import join
from contextlib import contextmanager
import shutil
from logging import StreamHandler

from tests.base import (
main,
capture_log,
assets,
)

from ocrd_utils import (
initLogging,
disableLogging,
getLogger,
VERSION,
MIMETYPE_PAGE
)
Expand Down Expand Up @@ -50,13 +56,6 @@ def test_str():
assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]'


@pytest.mark.xfail(reason='old test, was actually out-commented')
def test_override_constructor_args():
id2file = {'foo': {}}
mets = OcrdMets(id2file, content='<mets/>')
assert mets._file_by_id == id2file


def test_file_groups(sbb_sample_01):
assert len(sbb_sample_01.file_groups) == 17, '17 file groups shall be found'

Expand Down Expand Up @@ -116,12 +115,16 @@ def test_add_group():
assert len(mets.file_groups) == 1, '1 file groups'


def test_add_file():
def test_add_file0():
mets = OcrdMets.empty_mets()
assert len(mets.file_groups) == 0, '0 file groups'
assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"'
f = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
# TODO unless pageId/mimetype/fileGrp match raises exception this won't work
# with pytest.raises(Exception) as exc:
# f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
# assert str(exc.value) == "Exception: File with pageId='foobar' already exists in fileGrp 'OUTPUTx'"
f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar", is_alternative_image=True)
assert f.pageId == 'foobar', 'pageId set'
assert len(mets.file_groups) == 1, '1 file groups'
assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 2, '2 files in "OUTPUT"'
Expand All @@ -137,15 +140,32 @@ def test_add_file():
def test_add_file_id_already_exists(sbb_sample_01):
f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
assert f.ID == 'best-id-ever', "ID kept"
with pytest.raises(Exception) as exc:
with pytest.raises(FileExistsError) as exc:
sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep")

assert "File with ID='best-id-ever' already exists" in str(exc)
# Still fails because differing mimetypes
with pytest.raises(FileExistsError) as exc:
f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)

# Works but is unwise, there are now two files with clashing ID in METS
f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)
assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 2

f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)
assert f._el == f2._el
# Works because fileGrp, mimetype and pageId(== None) match and force is set
f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True)

# Previous step removed duplicate mets:file
assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1

def test_add_file_nopageid_overwrite(sbb_sample_01: OcrdMets):
"""
Test that when adding files without pageId
"""
with capture_log('ocrd_models.ocrd_mets.add_file') as cap:
file1 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml")
with pytest.raises(FileExistsError):
file2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml", ignore=False, force=False)

@pytest.mark.xfail(reason='2x same ID is valid if ignore == True')
def test_add_file_ignore(sbb_sample_01: OcrdMets):
"""Behavior if ignore-Flag set to true:
delegate responsibility to overwrite existing files to user"""
Expand All @@ -157,7 +177,7 @@ def test_add_file_ignore(sbb_sample_01: OcrdMets):

# how many files inserted
the_files = list(sbb_sample_01.find_files(ID='best-id-ever'))
assert len(the_files) == 1
assert len(the_files) == 2


def test_add_file_id_invalid(sbb_sample_01):
Expand Down
4 changes: 2 additions & 2 deletions tests/processor/test_ocrd_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def test_copies_ok(self):
output_files = workspace.mets.find_all_files(fileGrp='OUTPUT')
output_files.sort(key=lambda x: x.url)
print([str(s) for s in output_files])
self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_0001.tif')
self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_0001.xml')
self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_PHYS_0001.tif')
self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_PHYS_0001.xml')
self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID)
self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url)
self.assertEqual(len(output_files), 6)
Expand Down
Loading

0 comments on commit ea064f9

Please sign in to comment.