diff --git a/.pylintrc b/.pylintrc index b94107e54..120652d88 100644 --- a/.pylintrc +++ b/.pylintrc @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr,ocrd.model ignore-patterns='.*generateds.*' disable = useless-object-inheritance, + too-many-instance-attributes, inconsistent-return-statements, ungrouped-imports, missing-docstring, diff --git a/ocrd/constants.py b/ocrd/constants.py index 4c35a995a..df1085e08 100644 --- a/ocrd/constants.py +++ b/ocrd/constants.py @@ -25,6 +25,8 @@ TAG_METS_FILE = '{%s}file' % NAMESPACES['mets'] TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets'] TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets'] +TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets'] +TAG_METS_NAME = '{%s}name' % NAMESPACES['mets'] TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods'] diff --git a/ocrd/model/__init__.py b/ocrd/model/__init__.py index ca206a55b..7880e6988 100644 --- a/ocrd/model/__init__.py +++ b/ocrd/model/__init__.py @@ -3,3 +3,4 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile from .ocrd_swagger import OcrdSwagger +from .ocrd_agent import OcrdAgent diff --git a/ocrd/model/ocrd_agent.py b/ocrd/model/ocrd_agent.py new file mode 100644 index 000000000..c4eb62fbb --- /dev/null +++ b/ocrd/model/ocrd_agent.py @@ -0,0 +1,88 @@ +# import os +from ocrd.constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME + +from .ocrd_xml_base import ET + +class OcrdAgent(object): + """ + Represents a + """ + + # @staticmethod + # from_el(el): + # role = el_agent.get('ROLE') + # _type = el_agent.get('TYPE') + # otherrole = el_agent.get('OTHERROLE') + # name_parts = string.split(el.find('mets:name', NS).text, ' ', 2) + # # name = name_parts[0] + # # version = name_parts[1][1:] # v0.0.1 => 0.0.1 + # return OcrdAgent(el, name, role, _type, otherrole) + + def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None): + if el is None: + el = ET.Element(TAG_METS_AGENT) + self._el = el + self.name = name + self.type = _type + self.othertype = othertype + self.role = role + self.otherrole = otherrole + + def __str__(self): + props = ', '.join([ + '='.join([k, getattr(self, k) if getattr(self, k) else '---']) + for k in ['type', 'othertype', 'role', 'otherrole', 'name'] + ]) + return ' ' + + @property + def type(self): + return self._el.get('TYPE') + + @type.setter + def type(self, _type): + if _type is not None: + self._el.set('TYPE', _type) + + @property + def othertype(self): + return self._el.get('OTHERTYPE') + + @othertype.setter + def othertype(self, othertype): + if othertype is not None: + self._el.set('TYPE', 'OTHER') + self._el.set('OTHERTYPE', othertype) + + @property + def role(self): + return self._el.get('ROLE') + + @role.setter + def role(self, role): + if role is not None: + self._el.set('ROLE', role) + + @property + def otherrole(self): + return self._el.get('OTHERROLE') + + @otherrole.setter + def otherrole(self, otherrole): + if otherrole is not None: + self._el.set('ROLE', 'OTHER') + self._el.set('OTHERROLE', otherrole) + + @property + def name(self): + el_name = self._el.find('mets:name', NS) + if el_name is not None: + return el_name.text + + @name.setter + def name(self, name): + if name is not None: + el_name = self._el.find('mets:name', NS) + if el_name is None: + el_name = ET.SubElement(self._el, TAG_METS_NAME) + el_name.text = name diff --git a/ocrd/model/ocrd_mets.py b/ocrd/model/ocrd_mets.py index 9fdf46c7c..52a0e4b7c 100644 --- a/ocrd/model/ocrd_mets.py +++ b/ocrd/model/ocrd_mets.py @@ -4,6 +4,7 @@ NAMESPACES as NS, TAG_METS_FILE, TAG_METS_FILEGRP, + TAG_METS_AGENT, IDENTIFIER_PRIORITY, TAG_MODS_IDENTIFIER, METS_XML_EMPTY, @@ -12,6 +13,7 @@ from .ocrd_xml_base import OcrdXmlDocument, ET from .ocrd_file import OcrdFile +from .ocrd_agent import OcrdAgent class OcrdMets(OcrdXmlDocument): @@ -49,6 +51,20 @@ def unique_identifier(self, purl): id_el.set('type', 'purl') id_el.text = purl + @property + def agents(self): + return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('.//mets:metsHdr/mets:agent', NS)] + + def add_agent(self, *args, **kwargs): + """ + Add an agent to the list of agents in the metsHdr. + """ + el_metsHdr = self._tree.getroot().find('.//mets:metsHdr', NS) + # assert(el_metsHdr is not None) + el_agent = ET.SubElement(el_metsHdr, TAG_METS_AGENT) + # print(ET.tostring(el_metsHdr)) + return OcrdAgent(el_agent, *args, **kwargs) + @property def file_groups(self): return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] diff --git a/ocrd/model/ocrd_page.py b/ocrd/model/ocrd_page.py index 361e4b1bb..9ba42e778 100644 --- a/ocrd/model/ocrd_page.py +++ b/ocrd/model/ocrd_page.py @@ -58,7 +58,6 @@ def from_file(input_file): # print("PARSING PARSING '%s'" % input_file) if input_file.mimetype.startswith('image'): return page_from_image(input_file) - elif input_file.mimetype == MIMETYPE_PAGE: + if input_file.mimetype == MIMETYPE_PAGE: return parse(input_file.local_filename, silence=True) - else: - raise Exception("Unsupported mimetype '%s'" % input_file.mimetype) + raise Exception("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/ocrd/processor/base.py b/ocrd/processor/base.py index 884a90f68..4968efb0e 100644 --- a/ocrd/processor/base.py +++ b/ocrd/processor/base.py @@ -28,7 +28,7 @@ def run_processor( output_file_grp=None, parameter=None, working_dir=None, -): +): # pylint: disable=too-many-locals """ Create a workspace for mets_url and run processor through it @@ -59,8 +59,18 @@ def run_processor( output_file_grp=output_file_grp, parameter=parameter ) - log.debug("Processor instance %s", processor) + # print(processor.version) + name = '%s v%s' % (ocrd_tool['executable'], processor.version) + otherrole = ocrd_tool['steps'][0] + log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) processor.process() + workspace.mets.add_agent( + name=name, + _type='OTHER', + othertype='SOFTWARE', + role='OTHER', + otherrole=otherrole + ) workspace.save_mets() # TODO not used as of 0.8.2 @@ -75,7 +85,7 @@ def run_cli( output_file_grp=None, parameter=None, working_dir=None, -): +): # pylint: disable=unused-argument """ Create a workspace for mets_url and run MP CLI through it """ @@ -98,13 +108,15 @@ def __init__( self, workspace, ocrd_tool=None, - parameter={}, + parameter=None, input_file_grp="INPUT", output_file_grp="OUTPUT", group_id=None, dump_json=False, version=None ): + if parameter is None: + parameter = {} if dump_json: print(json.dumps(ocrd_tool, indent=True)) return diff --git a/ocrd/resolver.py b/ocrd/resolver.py index 1998fb1ab..fac36b077 100644 --- a/ocrd/resolver.py +++ b/ocrd/resolver.py @@ -4,7 +4,7 @@ import tempfile import requests -from ocrd.constants import TMP_PREFIX, EXT_TO_MIME +from ocrd.constants import TMP_PREFIX from ocrd.utils import getLogger, safe_filename from ocrd.workspace import Workspace from ocrd.model import OcrdMets @@ -226,4 +226,4 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me log.info("Writing %s", mets_fpath) fmets.write(mets.to_xml(xmllint=True)) - return Workspace(self, directory, mets) \ No newline at end of file + return Workspace(self, directory, mets) diff --git a/ocrd/workspace.py b/ocrd/workspace.py index d292e153c..b6b524e6f 100644 --- a/ocrd/workspace.py +++ b/ocrd/workspace.py @@ -173,13 +173,12 @@ def resolve_image_as_pil(self, image_url, coords=None): if coords is None: return pil_image - else: - if image_url not in self.image_cache['cv2']: - self.image_cache['cv2'][image_url] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) - cv2_image = self.image_cache['cv2'][image_url] - poly = np.array(coords, np.int32) - region_cut = cv2_image[ - np.min(poly[:, 1]):np.max(poly[:, 1]), - np.min(poly[:, 0]):np.max(poly[:, 0]) - ] - return Image.fromarray(region_cut) + if image_url not in self.image_cache['cv2']: + self.image_cache['cv2'][image_url] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) + cv2_image = self.image_cache['cv2'][image_url] + poly = np.array(coords, np.int32) + region_cut = cv2_image[ + np.min(poly[:, 1]):np.max(poly[:, 1]), + np.min(poly[:, 0]):np.max(poly[:, 0]) + ] + return Image.fromarray(region_cut) diff --git a/setup.py b/setup.py index 605ae3ad3..8571ea1b8 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'Flask', 'Pillow', 'click', - 'click >=7<8', + 'click >=7', 'jsonschema', 'lxml', 'numpy', diff --git a/test/model/test_agent.py b/test/model/test_agent.py new file mode 100644 index 000000000..1d1f4ce7c --- /dev/null +++ b/test/model/test_agent.py @@ -0,0 +1,29 @@ +from test.base import TestCase, assets, main # pylint: disable=unused-import + +from ocrd.model import OcrdAgent + +# pylint: disable=no-member +class TestOcrdAgent(TestCase): + + def test_basic1(self): + ag = OcrdAgent(role='FOO') + self.assertEqual(ag.role, 'FOO') + + def test_basic2(self): + ag = OcrdAgent(otherrole='BAR') + self.assertEqual(ag.role, 'OTHER') + self.assertEqual(ag.otherrole, 'BAR') + + def test_basic3(self): + ag = OcrdAgent(name='foobar') + self.assertEqual(ag.name, 'foobar') + ag.name = 'barfoo' + self.assertEqual(ag.name, 'barfoo') + + def test_basic4(self): + ag = OcrdAgent(othertype='foobar') + self.assertEqual(ag.type, 'OTHER') + # print(ag) + +if __name__ == '__main__': + main() diff --git a/test/model/test_ocrd_mets.py b/test/model/test_ocrd_mets.py index b91fe2ee4..a3b7260d0 100644 --- a/test/model/test_ocrd_mets.py +++ b/test/model/test_ocrd_mets.py @@ -3,6 +3,7 @@ from ocrd.constants import MIMETYPE_PAGE, VERSION from ocrd.model import OcrdMets +# pylint: disable=protected-access class TestOcrdMets(TestCase): def setUp(self): @@ -68,5 +69,13 @@ def test_file_groupid(self): f.groupId = 'foo' self.assertEqual(f.groupId, 'foo') + def test_agent(self): + # Processor(workspace=self.workspace) + mets = self.mets + self.assertEqual(len(mets.agents), 1) + mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + # print(['%s'%x for x in mets.agents]) + self.assertEqual(len(mets.agents), 2) + if __name__ == '__main__': main() diff --git a/test/processor/test_processor.py b/test/processor/test_processor.py index 3fb3e7a32..e2b4e16b8 100644 --- a/test/processor/test_processor.py +++ b/test/processor/test_processor.py @@ -1,17 +1,21 @@ +from test.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module + from ocrd.resolver import Resolver -from ocrd.processor.base import Processor +from ocrd.processor.base import Processor, run_processor -from test.base import TestCase, assets, main +DUMMY_TOOL = {'executable': 'ocrd-test', 'steps': ['recognition/post-correction']} class DummyProcessor(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = { - 'executable': 'ocrd-test' - } + kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' super(DummyProcessor, self).__init__(*args, **kwargs) + def process(self): + # print('# nope') + pass + class TestResolver(TestCase): def setUp(self): @@ -20,12 +24,20 @@ def setUp(self): def test_verify(self): proc = DummyProcessor(self.workspace) - self.assertEquals(proc.verify(), True) + self.assertEqual(proc.verify(), True) def test_json(self): DummyProcessor(self.workspace, dump_json=True) def test_params(self): proc = Processor(workspace=self.workspace) - self.assertEquals(proc.parameter, {}) + self.assertEqual(proc.parameter, {}) + + def test_run_agent(self): + no_agents_before = len(self.workspace.mets.agents) + run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace) + self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent') + print(self.workspace.mets.agents[no_agents_before]) +if __name__ == "__main__": + main()