Skip to content

Commit

Permalink
Merge pull request #191 from kba/agent-on-save
Browse files Browse the repository at this point in the history
Add agent for processors on save
  • Loading branch information
kba authored Oct 23, 2018
2 parents 582f52b + 72ea857 commit 8b1d38b
Show file tree
Hide file tree
Showing 13 changed files with 195 additions and 27 deletions.
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr,ocrd.model
ignore-patterns='.*generateds.*'
disable =
useless-object-inheritance,
too-many-instance-attributes,
inconsistent-return-statements,
ungrouped-imports,
missing-docstring,
Expand Down
2 changes: 2 additions & 0 deletions ocrd/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']

TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']

Expand Down
1 change: 1 addition & 0 deletions ocrd/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .ocrd_exif import OcrdExif
from .ocrd_file import OcrdFile
from .ocrd_swagger import OcrdSwagger
from .ocrd_agent import OcrdAgent
88 changes: 88 additions & 0 deletions ocrd/model/ocrd_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# import os
from ocrd.constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME

from .ocrd_xml_base import ET

class OcrdAgent(object):
    """
    Represents a <mets:agent>

    Thin wrapper around an XML element. TYPE, OTHERTYPE, ROLE and OTHERROLE
    are stored as attributes of the wrapped element; the name is stored as the
    text of its ``mets:name`` child. All setters ignore ``None``, so wrapping
    an existing element without further arguments leaves it untouched.

    Args:
        el: element to wrap. A new, empty agent element is created if None.
        name: text for the ``mets:name`` child.
        _type: value for the TYPE attribute.
        othertype: value for OTHERTYPE; also forces TYPE to ``OTHER``.
        role: value for the ROLE attribute.
        otherrole: value for OTHERROLE; also forces ROLE to ``OTHER``.
    """

    def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None):
        if el is None:
            el = ET.Element(TAG_METS_AGENT)
        self._el = el
        self.name = name
        # Order matters: othertype/otherrole are assigned after type/role
        # because their setters overwrite TYPE/ROLE with 'OTHER'.
        self.type = _type
        self.othertype = othertype
        self.role = role
        self.otherrole = otherrole

    def __str__(self):
        """Return a one-line summary; unset properties are shown as '---'."""
        props = ', '.join(
            # Read every property once; the name lookup searches the element.
            '='.join((key, getattr(self, key) or '---'))
            for key in ('type', 'othertype', 'role', 'otherrole', 'name')
        )
        return '<OcrdAgent [' + props + ']/> '

    @property
    def type(self):
        """Value of the TYPE attribute, or None if unset."""
        return self._el.get('TYPE')

    @type.setter
    def type(self, _type):
        if _type is not None:
            self._el.set('TYPE', _type)

    @property
    def othertype(self):
        """Value of the OTHERTYPE attribute, or None if unset."""
        return self._el.get('OTHERTYPE')

    @othertype.setter
    def othertype(self, othertype):
        if othertype is not None:
            # An OTHERTYPE is only meaningful together with TYPE="OTHER".
            self._el.set('TYPE', 'OTHER')
            self._el.set('OTHERTYPE', othertype)

    @property
    def role(self):
        """Value of the ROLE attribute, or None if unset."""
        return self._el.get('ROLE')

    @role.setter
    def role(self, role):
        if role is not None:
            self._el.set('ROLE', role)

    @property
    def otherrole(self):
        """Value of the OTHERROLE attribute, or None if unset."""
        return self._el.get('OTHERROLE')

    @otherrole.setter
    def otherrole(self, otherrole):
        if otherrole is not None:
            # An OTHERROLE is only meaningful together with ROLE="OTHER".
            self._el.set('ROLE', 'OTHER')
            self._el.set('OTHERROLE', otherrole)

    @property
    def name(self):
        """Text of the ``mets:name`` child, or None if there is no such child."""
        el_name = self._el.find('mets:name', NS)
        if el_name is None:
            return None
        return el_name.text

    @name.setter
    def name(self, name):
        if name is None:
            return
        el_name = self._el.find('mets:name', NS)
        if el_name is None:
            # First assignment: create the child element on demand.
            el_name = ET.SubElement(self._el, TAG_METS_NAME)
        el_name.text = name
16 changes: 16 additions & 0 deletions ocrd/model/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
NAMESPACES as NS,
TAG_METS_FILE,
TAG_METS_FILEGRP,
TAG_METS_AGENT,
IDENTIFIER_PRIORITY,
TAG_MODS_IDENTIFIER,
METS_XML_EMPTY,
Expand All @@ -12,6 +13,7 @@

from .ocrd_xml_base import OcrdXmlDocument, ET
from .ocrd_file import OcrdFile
from .ocrd_agent import OcrdAgent

class OcrdMets(OcrdXmlDocument):

Expand Down Expand Up @@ -49,6 +51,20 @@ def unique_identifier(self, purl):
id_el.set('type', 'purl')
id_el.text = purl

@property
def agents(self):
    """
    List of OcrdAgent wrappers, one per mets:agent found below a mets:metsHdr.
    """
    root = self._tree.getroot()
    agent_elements = root.findall('.//mets:metsHdr/mets:agent', NS)
    return [OcrdAgent(node) for node in agent_elements]

def add_agent(self, *args, **kwargs):
    """
    Add an agent to the list of agents in the metsHdr.

    A new mets:agent element is appended to the document's mets:metsHdr and
    wrapped in an OcrdAgent. If the document has no mets:metsHdr yet, one is
    created as the first child of the root element instead of failing on a
    missing parent.

    Args:
        *args, **kwargs: passed through to OcrdAgent after the new element
            (name, _type, othertype, role, otherrole).

    Returns:
        The OcrdAgent wrapping the newly added element.
    """
    el_root = self._tree.getroot()
    el_metsHdr = el_root.find('.//mets:metsHdr', NS)
    if el_metsHdr is None:
        # find() returns None for header-less documents; SubElement() would
        # then raise an opaque TypeError, so create the header on demand.
        el_metsHdr = ET.Element('{%s}metsHdr' % NS['mets'])
        el_root.insert(0, el_metsHdr)
    el_agent = ET.SubElement(el_metsHdr, TAG_METS_AGENT)
    return OcrdAgent(el_agent, *args, **kwargs)

@property
def file_groups(self):
return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)]
Expand Down
5 changes: 2 additions & 3 deletions ocrd/model/ocrd_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def from_file(input_file):
# print("PARSING PARSING '%s'" % input_file)
if input_file.mimetype.startswith('image'):
return page_from_image(input_file)
elif input_file.mimetype == MIMETYPE_PAGE:
if input_file.mimetype == MIMETYPE_PAGE:
return parse(input_file.local_filename, silence=True)
else:
raise Exception("Unsupported mimetype '%s'" % input_file.mimetype)
raise Exception("Unsupported mimetype '%s'" % input_file.mimetype)
20 changes: 16 additions & 4 deletions ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def run_processor(
output_file_grp=None,
parameter=None,
working_dir=None,
):
): # pylint: disable=too-many-locals
"""
Create a workspace for mets_url and run processor through it
Expand Down Expand Up @@ -59,8 +59,18 @@ def run_processor(
output_file_grp=output_file_grp,
parameter=parameter
)
log.debug("Processor instance %s", processor)
# print(processor.version)
name = '%s v%s' % (ocrd_tool['executable'], processor.version)
otherrole = ocrd_tool['steps'][0]
log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
processor.process()
workspace.mets.add_agent(
name=name,
_type='OTHER',
othertype='SOFTWARE',
role='OTHER',
otherrole=otherrole
)
workspace.save_mets()

# TODO not used as of 0.8.2
Expand All @@ -75,7 +85,7 @@ def run_cli(
output_file_grp=None,
parameter=None,
working_dir=None,
):
): # pylint: disable=unused-argument
"""
Create a workspace for mets_url and run MP CLI through it
"""
Expand All @@ -98,13 +108,15 @@ def __init__(
self,
workspace,
ocrd_tool=None,
parameter={},
parameter=None,
input_file_grp="INPUT",
output_file_grp="OUTPUT",
group_id=None,
dump_json=False,
version=None
):
if parameter is None:
parameter = {}
if dump_json:
print(json.dumps(ocrd_tool, indent=True))
return
Expand Down
4 changes: 2 additions & 2 deletions ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
import requests

from ocrd.constants import TMP_PREFIX, EXT_TO_MIME
from ocrd.constants import TMP_PREFIX
from ocrd.utils import getLogger, safe_filename
from ocrd.workspace import Workspace
from ocrd.model import OcrdMets
Expand Down Expand Up @@ -226,4 +226,4 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me
log.info("Writing %s", mets_fpath)
fmets.write(mets.to_xml(xmllint=True))

return Workspace(self, directory, mets)
return Workspace(self, directory, mets)
19 changes: 9 additions & 10 deletions ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,12 @@ def resolve_image_as_pil(self, image_url, coords=None):

if coords is None:
return pil_image
else:
if image_url not in self.image_cache['cv2']:
self.image_cache['cv2'][image_url] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
cv2_image = self.image_cache['cv2'][image_url]
poly = np.array(coords, np.int32)
region_cut = cv2_image[
np.min(poly[:, 1]):np.max(poly[:, 1]),
np.min(poly[:, 0]):np.max(poly[:, 0])
]
return Image.fromarray(region_cut)
if image_url not in self.image_cache['cv2']:
self.image_cache['cv2'][image_url] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
cv2_image = self.image_cache['cv2'][image_url]
poly = np.array(coords, np.int32)
region_cut = cv2_image[
np.min(poly[:, 1]):np.max(poly[:, 1]),
np.min(poly[:, 0]):np.max(poly[:, 0])
]
return Image.fromarray(region_cut)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
'Flask',
'Pillow',
'click',
'click >=7<8',
'click >=7',
'jsonschema',
'lxml',
'numpy',
Expand Down
29 changes: 29 additions & 0 deletions test/model/test_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from test.base import TestCase, assets, main # pylint: disable=unused-import

from ocrd.model import OcrdAgent

# pylint: disable=no-member
class TestOcrdAgent(TestCase):
    """Unit tests for the OcrdAgent wrapper around a mets:agent element."""

    def test_basic1(self):
        """A role given to the constructor can be read back."""
        agent = OcrdAgent(role='FOO')
        self.assertEqual(agent.role, 'FOO')

    def test_basic2(self):
        """Setting an otherrole switches the role to OTHER."""
        agent = OcrdAgent(otherrole='BAR')
        self.assertEqual(agent.role, 'OTHER')
        self.assertEqual(agent.otherrole, 'BAR')

    def test_basic3(self):
        """The name can be set in the constructor and reassigned later."""
        agent = OcrdAgent(name='foobar')
        self.assertEqual(agent.name, 'foobar')
        agent.name = 'barfoo'
        self.assertEqual(agent.name, 'barfoo')

    def test_basic4(self):
        """Setting an othertype switches the type to OTHER."""
        agent = OcrdAgent(othertype='foobar')
        self.assertEqual(agent.type, 'OTHER')

if __name__ == '__main__':
main()
9 changes: 9 additions & 0 deletions test/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from ocrd.constants import MIMETYPE_PAGE, VERSION
from ocrd.model import OcrdMets

# pylint: disable=protected-access
class TestOcrdMets(TestCase):

def setUp(self):
Expand Down Expand Up @@ -68,5 +69,13 @@ def test_file_groupid(self):
f.groupId = 'foo'
self.assertEqual(f.groupId, 'foo')

def test_agent(self):
    """Adding an agent to the fixture METS raises the agent count from 1 to 2."""
    document = self.mets
    agents_before = document.agents
    self.assertEqual(len(agents_before), 1)
    document.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL')
    agents_after = document.agents
    self.assertEqual(len(agents_after), 2)

if __name__ == '__main__':
main()
26 changes: 19 additions & 7 deletions test/processor/test_processor.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
from test.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module

from ocrd.resolver import Resolver
from ocrd.processor.base import Processor
from ocrd.processor.base import Processor, run_processor

from test.base import TestCase, assets, main
DUMMY_TOOL = {'executable': 'ocrd-test', 'steps': ['recognition/post-correction']}

class DummyProcessor(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = {
'executable': 'ocrd-test'
}
kwargs['ocrd_tool'] = DUMMY_TOOL
kwargs['version'] = '0.0.1'
super(DummyProcessor, self).__init__(*args, **kwargs)

def process(self):
# print('# nope')
pass

class TestResolver(TestCase):

def setUp(self):
Expand All @@ -20,12 +24,20 @@ def setUp(self):

def test_verify(self):
proc = DummyProcessor(self.workspace)
self.assertEquals(proc.verify(), True)
self.assertEqual(proc.verify(), True)

def test_json(self):
DummyProcessor(self.workspace, dump_json=True)

def test_params(self):
proc = Processor(workspace=self.workspace)
self.assertEquals(proc.parameter, {})
self.assertEqual(proc.parameter, {})

def test_run_agent(self):
    """run_processor records exactly one additional agent in the workspace METS."""
    agent_count = len(self.workspace.mets.agents)
    run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)
    agents_after = self.workspace.mets.agents
    self.assertEqual(len(agents_after), agent_count + 1, 'one more agent')
    # The new agent is the one at the old end-of-list position.
    print(agents_after[agent_count])

if __name__ == "__main__":
main()

0 comments on commit 8b1d38b

Please sign in to comment.