Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PageType.get_AllRegions to list all kinds of regions #479

Merged
merged 36 commits into from
Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
abef352
PageType.get_AllRegions to list all kinds of regions
kba May 13, 2020
3445f87
Update ocrd_models/ocrd_page_user_methods.py
bertsky May 14, 2020
a48b8c1
update generateds page, add region filter if using reading order, wip
kba May 14, 2020
f51a2e4
Merge branch 'hotfix-ocrd-page-exports' into get-all-regions
kba May 14, 2020
8da3f3c
Merge branch 'get-all-regions' of https://github.com/kba/ocrd-core in…
kba May 14, 2020
d2a01bb
refactoring: move generateDS methods to their own files
kba May 15, 2020
be7f026
get_AllRegions: adapt to signature proposed in #240, test with order=…
kba May 15, 2020
e1740f7
README: explain how to add user methods to PAGE API
kba May 15, 2020
6f9163e
Update ocrd_models/README.md
kba May 28, 2020
0c73b3e
Update ocrd_models/README.md
kba May 28, 2020
5c2f3a8
Update ocrd_models/README.md
kba May 28, 2020
6a57506
recursion (with both finite or arbitrary depth) for get_AllRegions
kba May 28, 2020
a9072c8
regenerate PAGE API
kba May 28, 2020
ac62b85
get_AllRegions: clean-up merge artifacts and reorganize
kba May 28, 2020
fd6d545
Update ocrd_models/ocrd_page_user_methods/get_AllRegions.py
kba May 28, 2020
86a7133
get_AllRegions: _region_id method unneccessary now
kba May 28, 2020
ce06392
Merge branch 'get-all-regions' of https://github.com/kba/ocrd-core in…
kba May 28, 2020
5c8d89b
regenerate PAGE API
kba May 28, 2020
f6e3da5
:art: pylint
kba May 28, 2020
8351056
add_AllIndexed -> extend_AllIndexed
kba May 28, 2020
f202205
get_AllRegions: differentiate "reading-order"/"reading-order-only"
kba May 28, 2020
ffba6f9
get_AllRegions: catch negative depth, test depth==0
kba May 29, 2020
207f396
:memo: get_AllRegions: document example
bertsky May 29, 2020
9ced315
get_AllRegions: fix recursion
kba May 29, 2020
629f38d
get_AllRegions: Update example
kba May 29, 2020
e958559
wip
kba May 29, 2020
1964563
reading order test sample: add unorderedgroups for testing
kba May 29, 2020
27e256f
add get_UnorderedGroupChildren, let get_AllIndexed handle UnorderedGr…
kba May 29, 2020
1b17e3f
get_AllIndexed: allow filtering by child type
kba May 29, 2020
ae613cf
get_AllIndexed: index_sort parameter to enable/disable sorting
kba May 29, 2020
b1df95f
add sort_AllIndexed to sort in-place
kba May 29, 2020
fd9dc83
extend_AllIndexed: increment @index when adding elements
kba May 29, 2020
9d0e539
Merge branch 'master' into get-all-regions
kba May 29, 2020
84f1d33
:memo: changelog
kba May 29, 2020
0e14633
Document extend_AllIndexed validate_contiunuity param
kba Jun 3, 2020
b79474a
Merge branch 'master' into get-all-regions
kba Jun 4, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions ocrd_models/ocrd_models/ocrd_page_generateds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# -*- coding: utf-8 -*-

#
# Generated Wed May 13 16:09:07 2020 by generateDS.py version 2.35.20.
# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0]
# Generated Thu May 14 15:35:20 2020 by generateDS.py version 2.35.20.
# Python 3.6.6 (default, Jul 24 2018, 16:39:20) [GCC 4.9.2]
#
# Command line options:
# ('-f', '')
Expand All @@ -16,7 +16,7 @@
# repo/assets/data/schema/data/2019.xsd
#
# Command line:
# /home/kba/miniconda3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd
# /data/monorepo/venv3.6/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd
#
# Current working directory (os.getcwd()):
# core
Expand Down Expand Up @@ -2850,7 +2850,40 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec
obj_.original_tagname_ = 'CustomRegion'
def __hash__(self):
return hash(self.id)
# end class PageType

def get_AllRegions(self, regions=None, reading_order=False):
    """
    Get all the ``*Region`` elements of this page, or only the kinds named in ``regions``.

    Arguments:
        regions (list): Region kinds without the ``Region`` suffix, e.g.
            ``['Text', 'Table']``. If empty or ``None``, all kinds are listed.
        reading_order (bool): If true and the page has a non-empty reading
            order, return only the regions referenced by it, in that order.

    Returns:
        A list of region objects. Without a usable reading order the list is
        grouped by kind, in the order the kinds appear in ``regions``.
    """
    if not regions:
        regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']
    ret = []
    for region in regions:
        ret += getattr(self, 'get_{}Region'.format(region))()
    if not reading_order:
        return ret

    def get_recursive_reading_order(rogroup):
        # Flatten a (possibly nested) reading-order group into the list of
        # region ids it references.
        if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
            elements = rogroup.get_AllIndexed()
        elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
            elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
        else:
            # Unknown group type: nothing to traverse (previously this left
            # ``elements`` unbound and raised UnboundLocalError).
            elements = []
        regionrefs = []
        for elem in elements:
            # Nested groups contribute their own regionRef as well; ids that
            # match no collected region are dropped by the lookup below.
            regionrefs.append(elem.get_regionRef())
            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
                regionrefs.extend(get_recursive_reading_order(elem))
        return regionrefs

    ro = self.get_ReadingOrder()
    ro_group = (ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if ro else None
    region_ids = get_recursive_reading_order(ro_group) if ro_group else []
    if not region_ids:
        # No reading order available: fall back to the unordered listing.
        return ret
    by_id = {region.id: region for region in ret}
    return [by_id[region_id] for region_id in region_ids if region_id in by_id]
# end class PageType


class CoordsType(GeneratedsSuper):
Expand Down
40 changes: 40 additions & 0 deletions ocrd_models/ocrd_page_user_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,45 @@ def show(self):
#
# Replace the following method specifications with your own.

#
# List all *Regions on the PAGE
#
# Method spec whose ``source`` is added to the generated class(es) matching
# ``class_names`` (here only PageType). The source is indented one level
# because it is meant to sit inside a class body.
get_AllRegions = MethodSpec(name='get_AllRegions',
    source=r'''
    def get_AllRegions(self, regions=None, reading_order=False):
        """
        Get all the ``*Region`` elements of this page, or only the kinds named in ``regions``.

        Arguments:
            regions (list): Region kinds without the ``Region`` suffix, e.g.
                ``['Text', 'Table']``. If empty or ``None``, all kinds are listed.
            reading_order (bool): If true and the page has a non-empty reading
                order, return only the regions referenced by it, in that order.

        Returns:
            A list of region objects. Without a usable reading order the list is
            grouped by kind, in the order the kinds appear in ``regions``.
        """
        if not regions:
            regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']
        ret = []
        for region in regions:
            ret += getattr(self, 'get_{}Region'.format(region))()
        if not reading_order:
            return ret

        def get_recursive_reading_order(rogroup):
            # Flatten a (possibly nested) reading-order group into the list of
            # region ids it references.
            if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
                elements = rogroup.get_AllIndexed()
            elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
                elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
            else:
                # Unknown group type: nothing to traverse (previously this left
                # ``elements`` unbound and raised UnboundLocalError).
                elements = []
            regionrefs = []
            for elem in elements:
                # Nested groups contribute their own regionRef as well; ids that
                # match no collected region are dropped by the lookup below.
                regionrefs.append(elem.get_regionRef())
                if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
                    regionrefs.extend(get_recursive_reading_order(elem))
            return regionrefs

        ro = self.get_ReadingOrder()
        ro_group = (ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if ro else None
        region_ids = get_recursive_reading_order(ro_group) if ro_group else []
        if not region_ids:
            # No reading order available: fall back to the unordered listing.
            return ret
        by_id = {region.id: region for region in ret}
        ret = [by_id[region_id] for region_id in region_ids if region_id in by_id]
        # Safeguard: keep only the requested kinds (assumes region classes are
        # named ``<Kind>RegionType``).
        ret = [r for r in ret if r.__class__.__name__.replace('RegionType', '') in regions]
        return ret
''', class_names=r'^(PageType)$')

#
# List all *Indexed children sorted by @index
#
Expand Down Expand Up @@ -167,6 +206,7 @@ def __hash__(self):
exportChildren,
get_AllIndexed,
add_AllIndexed,
get_AllRegions,
clear_AllIndexed,
)

Expand Down
13 changes: 13 additions & 0 deletions tests/model/test_ocrd_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,18 @@ def test_empty_groups_to_regionrefindexed(self):
children = og.get_AllIndexed()
self.assertTrue(isinstance(children[1], RegionRefIndexedType))

def test_all_regions_without_reading_order(self):
    """
    Count the regions returned by get_AllRegions, with and without a kind filter.

    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797
    """
    # Read raw bytes instead of decoding with the platform's default codec and
    # re-encoding as UTF-8, which can fail or alter the document on
    # non-UTF-8 locales.
    with open('tests/model/TEMP1_Gutachten2-2.xml', 'rb') as f:
        pcgts = parseString(f.read(), silence=True)
    pg = pcgts.get_Page()
    # Unfiltered total, then one count per requested region kind.
    self.assertEqual(len(pg.get_AllRegions()), 45)
    self.assertEqual(len(pg.get_AllRegions(['Separator'])), 25)
    self.assertEqual(len(pg.get_AllRegions(['Table'])), 3)
    self.assertEqual(len(pg.get_AllRegions(['Text'])), 17)

# Call main() only when this file is executed directly, not when imported.
if __name__ == '__main__':
main()