From b598deeaa187a44091f3a95c723b553c23cb0af8 Mon Sep 17 00:00:00 2001 From: wrznr Date: Wed, 17 Apr 2019 13:55:56 +0200 Subject: [PATCH 1/6] [WIP] Add deskewing per tesseract Via `AnalyseLayout` and `OSD`, tesseract can determine the skew angle for images. The new wrapper applies this to pages and regions. It is not clear yet how to save the estimated skew angle in PAGE XML. Cf. https://github.com/PRImA-Research-Lab/PAGE-XML/issues/9 --- ocrd_tesserocr/__init__.py | 1 + ocrd_tesserocr/cli.py | 6 +++ ocrd_tesserocr/deskew.py | 91 +++++++++++++++++++++++++++++++++++ ocrd_tesserocr/ocrd-tool.json | 22 +++++++++ setup.py | 6 ++- 5 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 ocrd_tesserocr/deskew.py diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py index 20d4b86..b29b063 100644 --- a/ocrd_tesserocr/__init__.py +++ b/ocrd_tesserocr/__init__.py @@ -11,3 +11,4 @@ from .segment_line import TesserocrSegmentLine from .segment_region import TesserocrSegmentRegion from .crop import TesserocrCrop +from .deskew import TesserocrDeskew diff --git a/ocrd_tesserocr/cli.py b/ocrd_tesserocr/cli.py index 9d836c8..c43190c 100644 --- a/ocrd_tesserocr/cli.py +++ b/ocrd_tesserocr/cli.py @@ -6,6 +6,7 @@ from ocrd_tesserocr.segment_line import TesserocrSegmentLine from ocrd_tesserocr.segment_word import TesserocrSegmentWord from ocrd_tesserocr.crop import TesserocrCrop +from ocrd_tesserocr.deskew import TesserocrDeskew @click.command() @ocrd_cli_options @@ -31,3 +32,8 @@ def ocrd_tesserocr_recognize(*args, **kwargs): @ocrd_cli_options def ocrd_tesserocr_crop(*args, **kwargs): return ocrd_cli_wrap_processor(TesserocrCrop, *args, **kwargs) + +@click.command() +@ocrd_cli_options +def ocrd_tesserocr_deskew(*args, **kwargs): + return ocrd_cli_wrap_processor(TesserocrDeskew, *args, **kwargs) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py new file mode 100644 index 0000000..236a75e --- /dev/null +++ b/ocrd_tesserocr/deskew.py @@ -0,0 
+1,91 @@ +from __future__ import absolute_import + +import locale + +# pylint: disable=wrong-import-position +locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3) + +from tesserocr import RIL, PSM, PyTessBaseAPI + +from ocrd_utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1, MIMETYPE_PAGE +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import ( + CoordsType, + GlyphType, + LabelType, + LabelsType, + MetadataItemType, + TextEquivType, + TextStyleType, + + to_xml +) +from ocrd import Processor +from .config import TESSDATA_PREFIX, OCRD_TOOL + +log = getLogger('processor.TesserocrDeskew') + +class TesserocrDeskew(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-deskew'] + kwargs['version'] = OCRD_TOOL['version'] + super(TesserocrDeskew, self).__init__(*args, **kwargs) + + def process(self): + """ + Performs the deskewing. 
+ """ + # print(self.parameter) + oplevel = self.parameter['operation_level'] + with PyTessBaseAPI(path=TESSDATA_PREFIX, psm=PSM.AUTO_OSD) as tessapi: + for (n, input_file) in enumerate(self.input_files): + log.info("INPUT FILE %i / %s", n, input_file) + pcgts = page_from_file(self.workspace.download_file(input_file)) + pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) + + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=OCRD_TOOL['tools']['ocrd-tesserocr-deskew']['steps'][0], + value='ocrd-tesserocr-deskew', + Labels=[LabelsType(externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + log.info("Deskewing on '%s' level on page '%s'", oplevel, pcgts.get_pcGtsId()) + + if oplevel == 'page': + self._process_page(tessapi, pil_image) + elif oplevel == 'region': + regions = pcgts.get_Page().get_TextRegion() + if not regions: + log.warning("Deskewing regions requested but page contains no text regions") + self._process_regions(regions, tessapi, pil_image) + + ID = concat_padded(self.output_file_grp, n) + self.workspace.add_file( + ID=ID, + file_grp=self.output_file_grp, + mimetype=MIMETYPE_PAGE, + local_filename='%s/%s' % (self.output_file_grp, ID), + content=to_xml(pcgts), + ) + + def _process_page(self, tessapi, pil_image): + tessapi.SetImage(pil_image) + orientation, direction, order, deskew_angle = tessapi.AnalyseLayout().Orientation() + log.debug("Deskew angle: {:.4f}".format(deskew_angle)) + + def _process_regions(self, regions, tessapi, pil_image): + for region in regions: + log.debug("Deskewing region '%s'", region.id) + region_xywh = xywh_from_points(region.get_Coords().points) + + # Note: we set the image instead of specifying a rectangle! 
+ pil_region_image = pil_image.crop((region_xywh['x'], region_xywh['y'], region_xywh['x'] + region_xywh['w'], region_xywh['y'] + region_xywh['h'])) + tessapi.SetImage(pil_region_image) + + orientation, direction, order, deskew_angle = tessapi.AnalyseLayout().Orientation() + log.debug("Deskew angle: {:.4f}".format(deskew_angle)) + region.set_orientation(deskew_angle) diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 3b35183..59f12ac 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -3,6 +3,28 @@ "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { + "ocrd-tesserocr-deskew": { + "executable": "ocrd-tesserocr-deskew", + "categories": ["Deskewing"], + "description": "Deskew images or image parts", + "input_file_grp": [ + "OCR-D-IMG", + "OCR-D-SEG-BLOCK" + ], + "output_file_grp": [ + "OCR-D-DESKEW-IMG", + "OCR-D-DESKEW-BLOCK" + ], + "steps": ["preprocessing/optimization/deskewing"], + "parameters": { + "operation_level": { + "type": "string", + "enum": ["page","region"], + "default": "page", + "description": "Level of operation for deskewing" + } + } + }, "ocrd-tesserocr-recognize": { "executable": "ocrd-tesserocr-recognize", "categories": ["Text recognition and optimization"], diff --git a/setup.py b/setup.py index c902afe..2acf4bf 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ - ocrd_tesserocr_segment_line - ocrd_tesserocr_segment_word - ocrd_tesserocr_crop + - ocrd_tesserocr_deskew """ import codecs @@ -17,8 +18,8 @@ version='0.2.2', description='Tesserocr bindings', long_description=codecs.open('README.rst', encoding='utf-8').read(), - author='Konstantin Baierer', - author_email='unixprog@gmail.com', + author='Konstantin Baierer, Kay-Michael Würzner', + author_email='unixprog@gmail.com, wuerzner@gmail.com', url='https://github.com/OCR-D/ocrd_tesserocr', license='Apache License 2.0', packages=find_packages(exclude=('tests', 'docs')), @@ -33,6 +34,7 @@ 
'ocrd-tesserocr-segment-line=ocrd_tesserocr.cli:ocrd_tesserocr_segment_line', 'ocrd-tesserocr-segment-word=ocrd_tesserocr.cli:ocrd_tesserocr_segment_word', 'ocrd-tesserocr-crop=ocrd_tesserocr.cli:ocrd_tesserocr_crop', + 'ocrd-tesserocr-deskew=ocrd_tesserocr.cli:ocrd_tesserocr_deskew', ] }, ) From 5107f3f58e101b619484a3526fcce4807f0ca4a1 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Fri, 28 Jun 2019 11:30:18 +0200 Subject: [PATCH 2/6] implement AlternativeImage-based processing: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - base all processors on AlternativeImage - make all processors create a parameter MetadataItem - make all processors create output file names from the input files, and use .xml extension for PAGE - introduce a `common` module along the lines of the ocropy wrapper (but without ocropy-specific segmentation), i.e. functions to be moved into core: - polygon_mask - rotate_polygon - image_from_page - image_from_region - image_from_line - image_from_word - image_from_glyph - save_image_file - bbox_from_points - points_from_bbox - xywh_from_bbox - bbox_from_xywh - points_from_polygon - in crop: - set textord_tabfind_find_tables=0 (because with table detection, the hinge often gets confused with a table column) - if a Border already exists, warn that it will be overwritten - if TextRegions already exist, calculate their common extent and warn it will be ignored - use PSM.SPARSE_TEXT instead of PSM.AUTO (so no image regions creep into neighbouring pages) - ignore regions which are empty after binarization - ignore regions with tiny width or height (< 30px) - add a padding to the result on all sides (4px) - do not annotate a (wrong) Border if no regions have been found - in deskew: - convert skewing angle from radians to degrees, and mind the direction (clockwise in PAGE, but mathematically positive in Pillow) and map to the numeric interval (-179,180) - add orientation (+90/180/270) to skewing angle - 
also rotate the raw image of the page/region (expand and fill with white) and store as file; reference in METS (under OCR-D-IMG-DESKEW) and in PAGE (as AlternativeImage, with appropriate comments) - annotate writing direction and textline order in PAGE too - use OSD (DetectOrientationScript) in addition to layout analysis (AnalyseLayout/Orientation), with confidence thresholds (>= 10): ensure that orientation is consistent between both (and in case of conflict, use the former), also annotate primary script; init appropriately (i.e. load "osd", use legacy OEM and AUTO_OSD) - on region level, process TableRegions as well - change default operation_level to region (because we still cannot annotate orientation on page level) - in segment_region: - add parameter `find_tables` (default: true) to allow disabling table detection (textord_tabfind_find_tables=0), so they can be analysed into independent text/sep regions - add parameter `overwrite_regions` (default: true) to allow enabling removal of any existing text regions - unconditionally remove any existing non-text regions and reading order groups - cover PT.VERTICAL_TEXT (as TextRegionType) and PT.TABLE (as TableRegionType) - use BlockPolygon (if present) to annotate polygon outline in Coords – but comment away, because patch against tesserocr segfaults awaits merge - add parameter `crop_polygons` (default: false) to enable: retrieve the raw region image along the (internal) polygon outline, store image as file, and reference in METS (under OCR-D-IMG-CROP) and in PAGE (as AlternativeImage) - in segment_line, add parameter `overwrite_lines` (default: true) to allow enabling removal of any existing text lines - in segment_word, add parameter `overwrite_words` (default: true) to allow enabling removal of any existing words - new processor binarize: - operate on page, region or line level - retrieve cropped, raw page/region/line image, then enter PSM.AUTO/SINGLE_BLOCK/SINGLE_LINE, and run layout analysis on the image, 
retrieve the binary image for RIL.BLOCK/TEXTLINE store image as file, and reference in METS (under OCR-D-IMG-BIN) and in PAGE (as AlternativeImage) - improve docstrings - remove redundant locale workaround from config (already in __init__) - version 0.2.2 → 0.2.3 --- .pylintrc | 14 +- CHANGELOG.md | 20 ++ ocrd_tesserocr/__init__.py | 1 + ocrd_tesserocr/binarize.py | 139 ++++++++++ ocrd_tesserocr/cli.py | 6 + ocrd_tesserocr/common.py | 456 +++++++++++++++++++++++++++++++ ocrd_tesserocr/config.py | 2 - ocrd_tesserocr/crop.py | 154 ++++++++--- ocrd_tesserocr/deskew.py | 239 ++++++++++++---- ocrd_tesserocr/ocrd-tool.json | 64 ++++- ocrd_tesserocr/recognize.py | 262 ++++++++++-------- ocrd_tesserocr/segment_line.py | 104 +++++-- ocrd_tesserocr/segment_region.py | 302 +++++++++++++++----- ocrd_tesserocr/segment_word.py | 100 +++++-- setup.py | 4 +- 15 files changed, 1527 insertions(+), 340 deletions(-) create mode 100644 ocrd_tesserocr/binarize.py create mode 100644 ocrd_tesserocr/common.py diff --git a/.pylintrc b/.pylintrc index 67d8439..dda4d9e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -5,13 +5,23 @@ ignored-modules=cv2,tesserocr [MESSAGES CONTROL] disable = ungrouped-imports, - fixme, +# fixme, bad-continuation, missing-docstring, no-self-use, - too-many-arguments, superfluous-parens, invalid-name, line-too-long, + too-many-arguments, + too-many-branches, + too-many-statements, too-many-locals, too-few-public-methods, + wrong-import-order, + duplicate-code + +# allow indented whitespace (as required by interpreter): +no-space-check=empty-line + +# allow non-snake-case identifiers: +good-names=n,i diff --git a/CHANGELOG.md b/CHANGELOG.md index c42b210..c853e87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [0.2.3] - 2019-06-28 + +Changed: + * Use basename of input file for output name + * Use .xml filename extension for PAGE output + * Warn about existing border or regions in `crop` + * Use `PSM.SPARSE_TEXT` without tables in `crop` + * Filter unreliable regions in `crop` + * Add padding around border in `crop` + * Delete existing regions in `segment_region` + * Cover vertical text and tables in `segment_region` + * Add parameter `find_tables` in `segment_region` + * Add parameter `crop_polygons` in `segment_region` + * Add parameter `overwrite_regions` in `segment_region` + * Add parameter `overwrite_lines` in `segment_line` + * Add parameter `overwrite_words` in `segment_word` + * Add page/region-level processor `deskew` + * Add page/region/line-level processor `binarize` + * Respect AlternativeImage on all levels + ## [0.2.2] - 2019-05-20 Changed: diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py index b29b063..c052054 100644 --- a/ocrd_tesserocr/__init__.py +++ b/ocrd_tesserocr/__init__.py @@ -12,3 +12,4 @@ from .segment_region import TesserocrSegmentRegion from .crop import TesserocrCrop from .deskew import TesserocrDeskew +from .binarize import TesserocrBinarize diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py new file mode 100644 index 0000000..954279f --- /dev/null +++ b/ocrd_tesserocr/binarize.py @@ -0,0 +1,139 @@ +from __future__ import absolute_import + +import os.path +from tesserocr import ( + PyTessBaseAPI, + PSM, RIL +) + +from ocrd_utils import ( + getLogger, concat_padded, + MIMETYPE_PAGE +) +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import ( + MetadataItemType, + LabelsType, LabelType, + AlternativeImageType, + TextRegionType, + to_xml +) +from ocrd import Processor + +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line, + save_image_file, + membername +) + +TOOL = 
'ocrd-tesserocr-binarize' +LOG = getLogger('processor.TesserocrBinarize') +FILEGRP_IMG = 'OCR-D-IMG-BIN' + +class TesserocrBinarize(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + kwargs['version'] = OCRD_TOOL['version'] + super(TesserocrBinarize, self).__init__(*args, **kwargs) + + def process(self): + """Performs binarization with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Set up Tesseract to recognise the segment image's layout, and get + the binarized image. Create an image file, and reference it as + AlternativeImage in the element and as file with a fileGrp USE + equal `OCR-D-IMG-BIN` in the workspace. + + Produce a new output file by serialising the resulting hierarchy. + """ + oplevel = self.parameter['operation_level'] + with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + for n, input_file in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) + pcgts = page_from_file(self.workspace.download_file(input_file)) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) + + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, 
page_id) + if oplevel == 'page': + tessapi.SetPageSegMode(PSM.AUTO) + self._process_segment(tessapi, RIL.BLOCK, page, page_image, page_xywh, + "page '%s'" % page_id, input_file.pageId, + file_id) + else: + regions = page.get_TextRegion() + page.get_TableRegion() + if not regions: + LOG.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + if oplevel == 'region': + tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh, + "region '%s'" % region.id, input_file.pageId, + file_id + '_' + region.id) + elif isinstance(region, TextRegionType): + lines = region.get_TextLine() + if not lines: + LOG.warning("Page '%s' region '%s' contains no text lines", + page_id, region.id) + for line in lines: + line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + tessapi.SetPageSegMode(PSM.SINGLE_LINE) + self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh, + "line '%s'" % line.id, input_file.pageId, + file_id + '_' + region.id + '_' + line.id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + mimetype=MIMETYPE_PAGE, + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id): + tessapi.SetImage(image) + image_bin = None + layout = tessapi.AnalyseLayout() + if layout: + image_bin = layout.GetBinaryImage(ril) + if not image_bin: + LOG.error('Cannot binarize %s', where) + return + # update METS (add the image file): + file_path = 
save_image_file(self.workspace, image_bin, + file_id, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + segment.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments="binarized")) diff --git a/ocrd_tesserocr/cli.py b/ocrd_tesserocr/cli.py index c43190c..eb3f0dc 100644 --- a/ocrd_tesserocr/cli.py +++ b/ocrd_tesserocr/cli.py @@ -7,6 +7,7 @@ from ocrd_tesserocr.segment_word import TesserocrSegmentWord from ocrd_tesserocr.crop import TesserocrCrop from ocrd_tesserocr.deskew import TesserocrDeskew +from ocrd_tesserocr.binarize import TesserocrBinarize @click.command() @ocrd_cli_options @@ -37,3 +38,8 @@ def ocrd_tesserocr_crop(*args, **kwargs): @ocrd_cli_options def ocrd_tesserocr_deskew(*args, **kwargs): return ocrd_cli_wrap_processor(TesserocrDeskew, *args, **kwargs) + +@click.command() +@ocrd_cli_options +def ocrd_tesserocr_binarize(*args, **kwargs): + return ocrd_cli_wrap_processor(TesserocrBinarize, *args, **kwargs) diff --git a/ocrd_tesserocr/common.py b/ocrd_tesserocr/common.py new file mode 100644 index 0000000..57b391b --- /dev/null +++ b/ocrd_tesserocr/common.py @@ -0,0 +1,456 @@ +from __future__ import absolute_import + +import os.path +import sys +import io + +import numpy as np +from PIL import Image, ImageDraw + +from ocrd_utils import getLogger, xywh_from_points, polygon_from_points + +LOG = getLogger('') # to be refined by importer + +# dummy (not available without ocrolib) +def resegment(mask_image, labels): + return mask_image + +# to be refactored into core (as function in ocrd_utils): +def polygon_mask(image, coordinates): + mask = Image.new('L', image.size, 0) + ImageDraw.Draw(mask).polygon(coordinates, outline=1, fill=255) + return mask + +# to be refactored into core (as function in ocrd_utils): +def rotate_polygon(coordinates, angle, orig={'x': 0, 'y': 0}): + # if the region image has been rotated, we must also + # rotate the coordinates of the line + # (which relate to the top page 
image) + # in the same direction but with inverse transformation + # matrix (i.e. passive rotation), and + # (since the region was rotated around its center, + # but our coordinates are now relative to the top left) + # by first translating to center of region, then + # rotating around that center, and translating back: + # point := (point - region_center) * region_rotation + region_center + # moreover, since rotation has reshaped/expanded the image, + # the line coordinates must be offset by those additional pixels: + # point := point + 0.5 * (new_region_size - old_region_size) + angle = np.deg2rad(angle) + # active rotation: [[cos, -sin], [sin, cos]] + # passive rotation: [[cos, sin], [-sin, cos]] (inverse) + return [(orig['x'] + + (x - orig['x'])*np.cos(angle) + + (y - orig['y'])*np.sin(angle), + orig['y'] + - (x - orig['x'])*np.sin(angle) + + (y - orig['y'])*np.cos(angle)) + for x, y in coordinates] + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_page(workspace, page, + page_image, + page_id): + """Extract the Page image from the workspace. + + Given a PIL.Image of the page, `page_image`, + and the Page object logically associated with it, `page`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `page_image` (if a Border exists), + or by just returning `page_image` (otherwise). + + When using AlternativeImage, if the resulting page image + is larger than the annotated page, then pass down the page's + box coordinates with an offset of half the width/height difference. + + Return the extracted image, and the page's box coordinates, + relative to the source image (for passing down). 
+ """ + page_xywh = {'x': 0, + 'y': 0, + 'w': page_image.width, + 'h': page_image.height} + # FIXME: remove PrintSpace here as soon as GT abides by the PAGE standard: + border = page.get_Border() or page.get_PrintSpace() + if border and border.get_Coords(): + LOG.debug("Using explictly set page border '%s' for page '%s'", + border.get_Coords().points, page_id) + page_xywh = xywh_from_points(border.get_Coords().points) + + alternative_image = page.get_AlternativeImage() + if alternative_image: + # (e.g. from page-level cropping, binarization, deskewing or despeckling) + # assumes implicit cropping (i.e. page_xywh has been applied already) + LOG.debug("Using AlternativeImage %d (%s) for page '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + page_id) + page_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + elif border: + page_image = page_image.crop( + box=(page_xywh['x'], + page_xywh['y'], + page_xywh['x'] + page_xywh['w'], + page_xywh['y'] + page_xywh['h'])) + # FIXME: mask away all GraphicRegion, SeparatorRegion etc which + # could overlay any text regions + # subtract offset from any increase in binary region size over source: + page_xywh['x'] -= 0.5 * max(0, page_image.width - page_xywh['w']) + page_xywh['y'] -= 0.5 * max(0, page_image.height - page_xywh['h']) + return page_image, page_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_region(workspace, region, + page_image, page_xywh): + """Extract the TextRegion image from a Page image. + + Given a PIL.Image of the page, `page_image`, + and its coordinates relative to the border, `page_xywh`, + and a TextRegion object logically contained in it, `region`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `page_image`. + + When cropping, respect any angle annotated for the region + (from deskewing) by rotating the cropped image, respectively. 
+ Regardless, if the resulting region image is larger than + the annotated region, pass down the region's box coordinates + with an offset of half the width/height difference. + + Return the extracted image, and the region's box coordinates, + relative to the page image (for passing down). + """ + region_xywh = xywh_from_points(region.get_Coords().points) + # region angle: PAGE orientation is defined clockwise, + # whereas PIL/ndimage rotation is in mathematical direction: + region_xywh['angle'] = -(region.get_orientation() or 0) + alternative_image = region.get_AlternativeImage() + if alternative_image: + # (e.g. from region-level cropping, binarization, deskewing or despeckling) + LOG.debug("Using AlternativeImage %d (%s) for region '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + region.id) + region_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + region_image = page_image.crop( + box=(region_xywh['x'] - page_xywh['x'], + region_xywh['y'] - page_xywh['y'], + region_xywh['x'] - page_xywh['x'] + region_xywh['w'], + region_xywh['y'] - page_xywh['y'] + region_xywh['h'])) + # FIXME: mask any overlapping regions (esp. Separator/Noise/Image) + # but we might need overlapping rules: e.g. an ImageRegion which + # properly contains our TextRegion should be completely ignored, but + # an ImageRegion which is properly contained in our TextRegion should + # be completely masked, while partial overlap may be more difficult + # to decide (use polygons?) 
+ if region_xywh['angle']: + LOG.info("About to rotate region '%s' by %.2f°", + region.id, region_xywh['angle']) + region_image = region_image.rotate(region_xywh['angle'], + expand=True, + #resample=Image.BILINEAR, + fillcolor='white') + # subtract offset from any increase in binary region size over source: + region_xywh['x'] -= 0.5 * max(0, region_image.width - region_xywh['w']) + region_xywh['y'] -= 0.5 * max(0, region_image.height - region_xywh['h']) + return region_image, region_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_line(workspace, line, + region_image, region_xywh, + segmentation=None): + """Extract the TextLine image from a TextRegion image. + + Given a PIL.Image of the region, `region_image`, + and its coordinates relative to the page, `region_xywh`, + and a TextLine object logically contained in it, `line`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `region_image`. + + When cropping, respect any angle annotated for the region + (from deskewing) by compensating the line coordinates in + an inverse transformation (translation to center, rotation, + re-translation). Also, mind the difference between annotated + and actual size of the region (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If passed an optional labelling for the region, `segmentation`, + the mask is shrinked further to the largest overlapping line + label, which avoids seeing ascenders from lines below, and + descenders from lines above `line`. + + If the resulting line image is larger than the annotated line, + pass down the line's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the line's box coordinates, + relative to the region image (for passing down). 
+ """ + line_points = line.get_Coords().points + line_xywh = xywh_from_points(line_points) + line_polygon = [(x - region_xywh['x'], + y - region_xywh['y']) + for x, y in polygon_from_points(line_points)] + alternative_image = line.get_AlternativeImage() + if alternative_image: + # (e.g. from line-level cropping, deskewing or despeckling) + LOG.debug("Using AlternativeImage %d (%s) for line '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + line.id) + line_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the line polygon: + line_polygon = rotate_polygon(line_polygon, + region_xywh['angle'], + orig={'x': 0.5 * region_image.width, + 'y': 0.5 * region_image.height}) + line_mask = polygon_mask(region_image, line_polygon) + if isinstance(segmentation, np.ndarray): + # modify mask from (ad-hoc) line segmentation of region + # (shrink to largest label spread in that area): + line_mask = resegment(line_mask, segmentation) + # create a background image from its median color + # (in case it has not been binarized yet): + region_array = np.asarray(region_image) + background = np.median(region_array, axis=[0, 1], keepdims=True) + region_array = np.broadcast_to(background.astype(np.uint8), region_array.shape) + line_image = Image.fromarray(region_array) + line_image.paste(region_image, mask=line_mask) + # recrop into a line: + bbox = line_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (line_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(line_mask.width, left + line_xywh['w']) + else: + left = line_xywh['x'] - region_xywh['x'] + upper = line_xywh['y'] - region_xywh['y'] + right = left + line_xywh['w'] + lower = upper + line_xywh['h'] + line_image = line_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase 
in binary line size over source: + line_xywh['x'] -= 0.5 * max(0, line_image.width - line_xywh['w']) + line_xywh['y'] -= 0.5 * max(0, line_image.height - line_xywh['h']) + return line_image, line_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_word(workspace, word, + line_image, line_xywh): + """Extract the Word image from a TextLine image. + + Given a PIL.Image of the line, `line_image`, + and its coordinates relative to the region, `line_xywh`, + and a Word object logically contained in it, `word`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `line_image`. + + When cropping, mind the difference between annotated + and actual size of the line (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If the resulting word image is larger than the annotated word, + pass down the word's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the word's box coordinates, + relative to the line image (for passing down). + """ + word_points = word.get_Coords().points + word_xywh = xywh_from_points(word_points) + word_polygon = [(x - line_xywh['x'], + y - line_xywh['y']) + for x, y in polygon_from_points(word_points)] + alternative_image = word.get_AlternativeImage() + if alternative_image: + # (e.g. 
from word-level cropping or binarization) + LOG.debug("Using AlternativeImage %d (%s) for word '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + word.id) + word_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the word polygon: + word_mask = polygon_mask(line_image, word_polygon) + # create a background image from its median color + # (in case it has not been binarized yet): + line_array = np.asarray(line_image) + background = np.median(line_array, axis=[0, 1], keepdims=True) + line_array = np.broadcast_to(background.astype(np.uint8), line_array.shape) + word_image = Image.fromarray(line_array) + word_image.paste(line_image, mask=word_mask) + # recrop into a line: + bbox = word_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (word_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(word_mask.width, left + word_xywh['w']) + else: + left = word_xywh['x'] - line_xywh['x'] + upper = word_xywh['y'] - line_xywh['y'] + right = left + word_xywh['w'] + lower = upper + word_xywh['h'] + word_image = word_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase in binary line size over source: + word_xywh['x'] -= 0.5 * max(0, word_image.width - word_xywh['w']) + word_xywh['y'] -= 0.5 * max(0, word_image.height - word_xywh['h']) + return word_image, word_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_glyph(workspace, glyph, + word_image, word_xywh): + """Extract the Glyph image from a Word image. + + Given a PIL.Image of the word, `word_image`, + and its coordinates relative to the line, `word_xywh`, + and a Glyph object logically contained in it, `glyph`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `word_image`. 
+ + When cropping, mind the difference between annotated + and actual size of the word (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If the resulting glyph image is larger than the annotated glyph, + pass down the glyph's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the glyph's box coordinates, + relative to the word image (for passing down). + """ + glyph_points = glyph.get_Coords().points + glyph_xywh = xywh_from_points(glyph_points) + glyph_polygon = [(x - word_xywh['x'], + y - word_xywh['y']) + for x, y in polygon_from_points(glyph_points)] + alternative_image = glyph.get_AlternativeImage() + if alternative_image: + # (e.g. from glyph-level cropping or binarization) + LOG.debug("Using AlternativeImage %d (%s) for glyph '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + glyph.id) + glyph_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the glyph polygon: + glyph_mask = polygon_mask(word_image, glyph_polygon) + # create a background image from its median color + # (in case it has not been binarized yet): + word_array = np.asarray(word_image) + background = np.median(word_array, axis=[0, 1], keepdims=True) + word_array = np.broadcast_to(background.astype(np.uint8), word_array.shape) + glyph_image = Image.fromarray(word_array) + glyph_image.paste(word_image, mask=glyph_mask) + # recrop into a word: + bbox = glyph_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (glyph_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(glyph_mask.width, left + glyph_xywh['w']) + else: + left = glyph_xywh['x'] - word_xywh['x'] + upper = glyph_xywh['y'] - word_xywh['y'] + right = left + glyph_xywh['w'] + 
lower = upper + glyph_xywh['h'] + glyph_image = glyph_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase in binary word size over source: + glyph_xywh['x'] -= 0.5 * max(0, glyph_image.width - glyph_xywh['w']) + glyph_xywh['y'] -= 0.5 * max(0, glyph_image.height - glyph_xywh['h']) + return glyph_image, glyph_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def save_image_file(workspace, image, + file_id, + page_id=None, + file_grp='OCR-D-IMG', # or -BIN? + format='PNG', + force=True): + """Store and reference an image as file into the workspace. + + Given a PIL.Image `image`, and an ID `file_id` to use in METS, + store the image under the fileGrp `file_grp` and physical page + `page_id` into the workspace (in a file name based on + the `file_grp`, `file_id` and `format` extension). + + Return the (absolute) path of the created file. + """ + image_bytes = io.BytesIO() + image.save(image_bytes, format=format) + file_path = os.path.join(file_grp, + file_id + '.' 
+ format.lower()) + out = workspace.add_file( + ID=file_id, + file_grp=file_grp, + pageId=page_id, + local_filename=file_path, + mimetype='image/' + format.lower(), + content=image_bytes.getvalue(), + force=force) + LOG.info('created file ID: %s, file_grp: %s, path: %s', + file_id, file_grp, out.local_filename) + return file_path + +# to be refactored into core (as function in ocrd_utils): +def bbox_from_points(points): + """Constructs a numeric list representing a bounding box from polygon coordinates in page representation.""" + xys = [[int(p) for p in pair.split(',')] for pair in points.split(' ')] + minx = sys.maxsize + miny = sys.maxsize + maxx = 0 + maxy = 0 + for xy in xys: + if xy[0] < minx: + minx = xy[0] + if xy[0] > maxx: + maxx = xy[0] + if xy[1] < miny: + miny = xy[1] + if xy[1] > maxy: + maxy = xy[1] + return minx, miny, maxx, maxy + +# to be refactored into core (as function in ocrd_utils): +def points_from_bbox(minx, miny, maxx, maxy): + """Constructs polygon coordinates in page representation from a numeric list representing a bounding box.""" + return "%i,%i %i,%i %i,%i %i,%i" % ( + minx, miny, maxx, miny, maxx, maxy, minx, maxy) + +# to be refactored into core (as function in ocrd_utils): +def xywh_from_bbox(minx, miny, maxx, maxy): + """Converts a bounding box from a numeric list to a numeric dict representation.""" + return { + 'x': minx, + 'y': miny, + 'w': maxx - minx, + 'h': maxy - miny, + } + +# to be refactored into core (as function in ocrd_utils): +def bbox_from_xywh(xywh): + """Converts a bounding box from a numeric dict to a numeric list representation.""" + return ( + xywh['x'], + xywh['y'], + xywh['x'] + xywh['w'], + xywh['y'] + xywh['h'] + ) + +# to be refactored into core (as function in ocrd_utils): +def points_from_polygon(polygon): + """Converts polygon coordinates from a numeric list representation to a page representation.""" + return " ".join("%i,%i" % (x, y) for x, y in polygon) + +def membername(class_, val): + return 
next((k for k, v in class_.__dict__.items() if v == val), str(val)) diff --git a/ocrd_tesserocr/config.py b/ocrd_tesserocr/config.py index 1b81509..528b184 100644 --- a/ocrd_tesserocr/config.py +++ b/ocrd_tesserocr/config.py @@ -2,8 +2,6 @@ import json from pkg_resources import resource_string -import locale -locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3) import tesserocr TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX'] if 'TESSDATA_PREFIX' in os.environ else tesserocr.get_languages()[0] diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 4a8f118..fca0a55 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -1,83 +1,157 @@ from __future__ import absolute_import +import os.path + import tesserocr -from ocrd_utils import getLogger, concat_padded, points_from_xywh, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + MetadataItemType, + LabelsType, LabelType, CoordsType, - to_xml ) from ocrd_models.ocrd_page_generateds import BorderType - from ocrd import Processor -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + bbox_from_points, points_from_bbox, + bbox_from_xywh +) + +TOOL = 'ocrd-tesserocr-crop' +LOG = getLogger('processor.TesserocrCrop') -log = getLogger('processor.TesserocrCrop') +PADDING = 4 # extend detected border by how many (true) pixels on every side? class TesserocrCrop(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-crop'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrCrop, self).__init__(*args, **kwargs) def process(self): - """ - Performs the cropping. 
+ """Performs crude page cropping with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images. + Set up Tesseract to detect text blocks on each page, and find + the largest coordinate extent spanning all of them. Use this + extent in defining a Border, and add that to the page. + + Produce new output files by serialising the resulting hierarchy. """ with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: - # print(self.input_file_grp) + # disable table detection here (tables count as text blocks), + # because we do not want to risk confusing the spine with + # a column separator and thus creeping into a neighbouring + # page: + tessapi.SetVariable("textord_tabfind_find_tables", "0") for (n, input_file) in enumerate(self.input_files): - # print(input_file) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - log.debug("Cropping with tesseract") - tessapi.SetImage(image) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + border = page.get_Border() + if border: + left, top, right, bottom = bbox_from_points(border.get_Coords().points) + LOG.warning('Overwriting existing Border: %i:%i,%i:%i', + left, top, right, bottom) + regions = page.get_TextRegion() + if regions: + min_x = image.width + min_y = image.height + max_x = 0 + max_y = 0 + for region in regions: + left, top, right, 
bottom = bbox_from_points(region.get_Coords().points) + min_x = min(min_x, left) + min_y = min(min_y, top) + max_x = max(max_x, right) + max_y = max(max_y, bottom) + LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i', + min_x, max_x, min_y, max_y) + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + LOG.debug("Cropping with tesseract") + tessapi.SetImage(page_image) + # PSM.SPARSE_TEXT: get as much text as possible in no particular order + # PSM.AUTO (default): includes tables (dangerous) + tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT) # # helper variables for saving the box coordinates # - min_x = image.width - min_y = image.height + min_x = page_image.width + min_y = page_image.height max_x = 0 max_y = 0 - - # iterate over all boxes and compare their extent - # to the min and max values + # iterate over all text blocks and compare their + # bbox extent to the running min and max values for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): - points, index = points_from_xywh(component[1]), component[2] - + image, xywh, index, para = component # # the region reference in the reading order element # ID = "region%04d" % index - log.debug("Detected region '%s': %s", ID, points) - - for pair in points.split(' '): - x, y = (int(pair.split(',')[0]), int(pair.split(',')[1])) - if x < min_x: - min_x = x - if y < min_y: - min_y = y - elif x > max_x: - max_x = x - elif y > max_y: - max_y = y - log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)) + left, top, right, bottom = bbox_from_xywh(xywh) + LOG.debug("Detected text region '%s': %i:%i,%i:%i", + ID, left, right, top, bottom) + # filter region results: + bin_bbox = image.getbbox() + if not bin_bbox: + # this does happen! 
+ LOG.info("Ignoring region '%s' because its binarization is empty", ID) + continue + if bin_bbox[2]-bin_bbox[0] < 30 or bin_bbox[3]-bin_bbox[1] < 30: + # we must be conservative here: page numbers are tiny regions, too! + LOG.info("Ignoring region '%s' because its binarization is too small", ID) + continue + min_x = min(min_x, left) + min_y = min(min_y, top) + max_x = max(max_x, right) + max_y = max(max_y, bottom) + LOG.debug("Updated page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) # # set the identified page border # - brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))) - pcgts.get_Page().set_Border(brd) + if min_x < max_x and min_y < max_y: + # add padding: + min_x = max(min_x - PADDING, 0) + max_x = min(max_x + PADDING, page_image.width) + min_y = max(min_y - PADDING, 0) + max_y = min(max_y + PADDING, page_image.height) + LOG.debug("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) + border = BorderType(Coords=CoordsType( + points_from_bbox(min_x, min_y, max_x, max_y))) + page.set_Border(border) + else: + LOG.error("Cannot find valid extent for page '%s'", page_id) - ID = concat_padded(self.output_file_grp, n) + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 236a75e..89be7b6 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -1,91 +1,216 @@ from __future__ 
import absolute_import -import locale - -# pylint: disable=wrong-import-position -locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3) - -from tesserocr import RIL, PSM, PyTessBaseAPI +import os.path +import math +from tesserocr import ( + PyTessBaseAPI, + PSM, OEM, + Orientation, + WritingDirection, + TextlineOrder +) -from ocrd_utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - CoordsType, - GlyphType, - LabelType, - LabelsType, MetadataItemType, - TextEquivType, - TextStyleType, - + LabelsType, LabelType, + AlternativeImageType, + TextRegionType, PageType, to_xml ) from ocrd import Processor + from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + save_image_file, + membername +) -log = getLogger('processor.TesserocrDeskew') +TOOL = 'ocrd-tesserocr-deskew' +LOG = getLogger('processor.TesserocrDeskew') +FILEGRP_IMG = 'OCR-D-IMG-DESKEW' class TesserocrDeskew(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-deskew'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrDeskew, self).__init__(*args, **kwargs) def process(self): + """Performs region-level deskewing with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the region level + for all text and table regions. + + Set up Tesseract to recognise the region image's orientation, skew + and script (with both OSD and AnalyseLayout). Rotate the image + accordingly, and annotate the angle, readingDirection and textlineOrder. 
+ + Create a cropped (and possibly deskewed) image file, and reference it + as AlternativeImage in the region element and as file with a fileGrp USE + equal `OCR-D-IMG-DESKEW` in the workspace. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the deskewing. - """ - # print(self.parameter) oplevel = self.parameter['operation_level'] - with PyTessBaseAPI(path=TESSDATA_PREFIX, psm=PSM.AUTO_OSD) as tessapi: - for (n, input_file) in enumerate(self.input_files): - log.info("INPUT FILE %i / %s", n, input_file) + with PyTessBaseAPI( + path=TESSDATA_PREFIX, + lang="osd", # osd required for legacy init! + oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD! + psm=PSM.AUTO_OSD + ) as tessapi: + for n, input_file in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", - name=OCRD_TOOL['tools']['ocrd-tesserocr-deskew']['steps'][0], - value='ocrd-tesserocr-deskew', - Labels=[LabelsType(externalRef="parameters", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", Label=[LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys()])])) - log.info("Deskewing on '%s' level on page '%s'", oplevel, pcgts.get_pcGtsId()) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) + page_image, page_xywh = 
image_from_page( + self.workspace, page, page_image, page_id) if oplevel == 'page': - self._process_page(tessapi, pil_image) - elif oplevel == 'region': - regions = pcgts.get_Page().get_TextRegion() + self._process_segment(tessapi, page, page_image, page_xywh, + "page '%s'" % page_id, input_file.pageId, + file_id) + else: + regions = page.get_TextRegion() + page.get_TableRegion() if not regions: - log.warning("Deskewing regions requested but page contains no text regions") - self._process_regions(regions, tessapi, pil_image) - - ID = concat_padded(self.output_file_grp, n) + LOG.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + self._process_segment(tessapi, region, region_image, region_xywh, + "region '%s'" % region.id, input_file.pageId, + file_id + '_' + region.id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts), - ) - - def _process_page(self, tessapi, pil_image): - tessapi.SetImage(pil_image) - orientation, direction, order, deskew_angle = tessapi.AnalyseLayout().Orientation() - log.debug("Deskew angle: {:.4f}".format(deskew_angle)) - - def _process_regions(self, regions, tessapi, pil_image): - for region in regions: - log.debug("Deskewing region '%s'", region.id) - region_xywh = xywh_from_points(region.get_Coords().points) - - # Note: we set the image instead of specifying a rectangle! 
- pil_region_image = pil_image.crop((region_xywh['x'], region_xywh['y'], region_xywh['x'] + region_xywh['w'], region_xywh['y'] + region_xywh['h'])) - tessapi.SetImage(pil_region_image) - - orientation, direction, order, deskew_angle = tessapi.AnalyseLayout().Orientation() - log.debug("Deskew angle: {:.4f}".format(deskew_angle)) - region.set_orientation(deskew_angle) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): + comments = 'cropped' + angle = 0. + tessapi.SetImage(image) + #tessapi.SetPageSegMode(PSM.AUTO_OSD) + # + # orientation/script + # + osr = tessapi.DetectOrientationScript() + if osr: + assert osr['orient_conf'] and not math.isnan(osr['orient_conf']), \ + "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" + if osr['orient_conf'] < 10: + LOG.info('ignoring OSD orientation result %d° due to low confidence %.0f in %s', + osr['orient_deg'], osr['orient_conf'], where) + else: + LOG.info('applying OSD orientation result %d° with high confidence %.0f in %s', + osr['orient_deg'], osr['orient_conf'], where) + angle = osr['orient_deg'] + if angle: + comments += ',rotated-%d' % angle + assert osr['script_conf'] and not math.isnan(osr['script_conf']), \ + "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" + if osr['script_conf'] < 10: + LOG.info('ignoring OSD script result "%s" due to low confidence %.0f in %s', + osr['script_name'], osr['script_conf'], where) + else: + LOG.info('applying OSD script result "%s" with high confidence %.0f in %s', + osr['script_name'], osr['script_conf'], where) + segment.set_primaryScript(osr['script_name']) + else: + LOG.warning('no OSD result in %s', where) + # + # orientation/skew + # + layout = tessapi.AnalyseLayout() + if layout: + orientation, writing_direction, textline_order, 
deskew_angle = layout.Orientation() + LOG.info('orientation/deskewing for %s: %s / %s / %s / %.3f', where, + membername(Orientation, orientation), + membername(WritingDirection, writing_direction), + membername(TextlineOrder, textline_order), + deskew_angle) + # clockwise rotation, as defined in Tesseract OrientationIdToValue: + angle2 = { + Orientation.PAGE_RIGHT: 270, + Orientation.PAGE_DOWN: 180, + Orientation.PAGE_LEFT: 90 + }.get(orientation, 0) + if angle2 != angle: + LOG.warning('inconsistent angles from layout analysis (%d) and orientation detection (%d) in %s', + angle2, angle, where) + deskew_angle *= - 180 / math.pi + if int(deskew_angle): + comments += ',deskewed' + # if angle: + # image = image.transpose({ + # 90: Image.ROTATE_90, + # 180: Image.ROTATE_180, + # 270: Image.ROTATE_270 + # }.get(angle)) # no default + # angle += deskew_angle + if angle: + # Tesseract layout analysis already rotates the image, even for each + # sub-segment (depending on RIL), but the accuracy is not as good + # as setting the image to the sub-segments and running without iterator. + # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region) + # Unfortunately, it does _not_ use expand=True, but chops off corners. 
+ # So we must do it here from the original image ourself: + image = image.rotate(-angle, expand=True, fillcolor='white') + angle = 180 - (180 - angle) % 360 # map to [-179.999,180] + # FIXME: remove that condition as soon as PAGE has orientation on PageType: + if not isinstance(segment, PageType): + segment.set_orientation(angle) + if isinstance(segment, (TextRegionType, PageType)): + segment.set_readingDirection({ + WritingDirection.LEFT_TO_RIGHT: 'left-to-right', + WritingDirection.RIGHT_TO_LEFT: 'right-to-left', + WritingDirection.TOP_TO_BOTTOM: 'top-to-bottom' + }.get(writing_direction, 'bottom-to-top')) + segment.set_textLineOrder({ + TextlineOrder.LEFT_TO_RIGHT: 'left-to-right', + TextlineOrder.RIGHT_TO_LEFT: 'right-to-left', + TextlineOrder.TOP_TO_BOTTOM: 'top-to-bottom' + }.get(textline_order, 'bottom-to-top')) + # baseline = layout.Baseline(RIL.BLOCK) + # if baseline: + # points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1])) + # segment.add_Baseline(BaselineType(points=points)) + # update METS (add the image file): + file_path = save_image_file(self.workspace, image, + file_id, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + segment.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments=comments)) diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 59f12ac..72418f1 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -6,13 +6,12 @@ "ocrd-tesserocr-deskew": { "executable": "ocrd-tesserocr-deskew", "categories": ["Deskewing"], - "description": "Deskew images or image parts", + "description": "Deskew pages or regions", "input_file_grp": [ "OCR-D-IMG", "OCR-D-SEG-BLOCK" ], "output_file_grp": [ - "OCR-D-DESKEW-IMG", "OCR-D-DESKEW-BLOCK" ], "steps": ["preprocessing/optimization/deskewing"], @@ -20,8 +19,8 @@ "operation_level": { "type": "string", "enum": ["page","region"], - "default": "page", - "description": "Level of operation 
for deskewing" + "default": "region", + "description": "PAGE XML hierarchy level to operate on" } } }, @@ -68,7 +67,23 @@ "OCR-D-SEG-BLOCK" ], "steps": ["layout/segmentation/region"], - "parameters": {} + "parameters": { + "overwrite_regions": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the Page level" + }, + "crop_polygons": { + "type": "boolean", + "default": false, + "description": "annotate polygon coordinates instead of rectangles, and create cropped AlternativeImage masked by the polygon outlines" + }, + "find_tables": { + "type": "boolean", + "default": true, + "description": "recognise table regions (textord_tabfind_find_tables)" + } + } }, "ocrd-tesserocr-segment-line": { "executable": "ocrd-tesserocr-segment-line", @@ -81,7 +96,13 @@ "OCR-D-SEG-LINE" ], "steps": ["layout/segmentation/line"], - "parameters": {} + "parameters": { + "overwrite_lines": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the TextRegion level" + } + } }, "ocrd-tesserocr-segment-word": { "executable": "ocrd-tesserocr-segment-word", @@ -94,7 +115,13 @@ "OCR-D-SEG-WORD" ], "steps": ["layout/segmentation/word"], - "parameters": {} + "parameters": { + "overwrite_words": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the TextLine level" + } + } }, "ocrd-tesserocr-crop": { "executable": "ocrd-tesserocr-crop", @@ -108,6 +135,29 @@ ], "steps": ["preprocessing/optimization/cropping"], "parameters" : {} + }, + "ocrd-tesserocr-binarize": { + "executable": "ocrd-tesserocr-binarize", + "categories": ["Binarization"], + "description": "Binarize pages, regions or lines", + "input_file_grp": [ + "OCR-D-IMG", + "OCR-D-SEG-BLOCK", + "OCR-D-SEG-LINE" + ], + "output_file_grp": [ + "OCR-D-BIN-BLOCK", + "OCR-D-BIN-LINE" + ], + "steps": ["preprocessing/optimization/binarization"], + "parameters": { + "operation_level": { 
+ "type": "string", + "enum": ["page", "region", "line"], + "default": "region", + "description": "PAGE XML hierarchy level to operate on" + } + } } } } diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 86a2603..959200e 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -1,14 +1,14 @@ from __future__ import absolute_import -import math +import os.path from tesserocr import ( RIL, PSM, - PyTessBaseAPI, get_languages, - Orientation, TextlineOrder, WritingDirection) + PyTessBaseAPI, get_languages) from ocrd_utils import ( getLogger, concat_padded, - polygon_from_points, xywh_from_points, points_from_x0y0x1y1, + points_from_x0y0x1y1, + xywh_from_points, points_from_xywh, MIMETYPE_PAGE) from ocrd_models.ocrd_page import ( CoordsType, @@ -19,25 +19,33 @@ to_xml) from ocrd_modelfactory import page_from_file from ocrd import Processor + from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line, + image_from_word, + image_from_glyph +) -log = getLogger('processor.TesserocrRecognize') +TOOL = 'ocrd-tesserocr-recognize' +LOG = getLogger('processor.TesserocrRecognize') CHOICE_THRESHOLD_NUM = 6 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 0.2 # maximum score drop from best choice to query and annotate -MAX_ELEMENTS = 500 # maximum number of lower level elements embedded within each element (for word/glyph iterators) class TesserocrRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-recognize'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrRecognize, self).__init__(*args, **kwargs) def process(self): """Perform OCR recognition with Tesseract on the workspace. 
- Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the requested `textequiv_level`. If `overwrite_words` is enabled and any layout annotation below the line level already exists, then remove it @@ -46,10 +54,11 @@ def process(self): the appropriate mode and `model`. Create new elements below the line level if necessary. Put text results and confidence values into new TextEquiv at `textequiv_level`, and make the higher levels consistent - with that (by concatenation joined by whitespace). Produce new output - files by serialising the resulting hierarchy. + with that (by concatenation joined by whitespace). + + Produce new output files by serialising the resulting hierarchy. """ - log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) + LOG.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) maxlevel = self.parameter['textequiv_level'] model = get_languages()[1][-1] # last installed model if 'model' in self.parameter: @@ -58,7 +67,8 @@ def process(self): if sub_model not in get_languages()[1]: raise Exception("configured model " + sub_model + " is not installed") with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: - log.info("Using model '%s' in %s for recognition at the %s level", model, get_languages()[0], maxlevel) + LOG.info("Using model '%s' in %s for recognition at the %s level", + model, get_languages()[0], maxlevel) # todo: populate GetChoiceIterator() with LSTM models, too: #tessapi.SetVariable("lstm_choice_mode", "2") # todo: determine relevancy of these variables: @@ -107,66 +117,60 @@ def process(self): # user_words_file # user_patterns_file for (n, input_file) in enumerate(self.input_files): - log.info("INPUT FILE %i / %s", n, input_file) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = 
page_from_file(self.workspace.download_file(input_file)) - # TODO use binarized / gray - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - tessapi.SetImage(pil_image) metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", - name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0], - value='ocrd-tesserocr-recognize', + name=self.ocrd_tool['steps'][0], + value=TOOL, # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this # what we want here is `externalModel="ocrd-tool" externalId="parameters"` Labels=[LabelsType(#externalRef="parameters", Label=[LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys()])])) - log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId()) - regions = pcgts.get_Page().get_TextRegion() + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + #tessapi.SetImage(page_image) + LOG.info("Processing page '%s'", page_id) + regions = page.get_TextRegion() if not regions: - log.warning("Page contains no text regions") - self._process_regions(regions, maxlevel, tessapi) + LOG.warning("Page '%s' contains no text regions", page_id) + else: + self._process_regions(tessapi, regions, page_image, page_xywh) page_update_higher_textequiv_levels(maxlevel, pcgts) - ID = concat_padded(self.output_file_grp, n) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts), 
- ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) - def _process_regions(self, regions, maxlevel, tessapi): + def _process_regions(self, tessapi, regions, page_image, page_xywh): for region in regions: - log.debug("Recognizing text in region '%s'", region.id) - # todo: determine if and how this can still be used for region classification: - # result_it = tessapi.GetIterator() - # if not result_it or result_it.Empty(RIL.BLOCK) - # ptype = result_it.BlockType() - # PT.UNKNOWN - # PT.FLOWING_TEXT - # PT.HEADING_TEXT - # PT.PULLOUT_TEXT - # PT.EQUATION - # PT.TABLE - # PT.VERTICAL_TEXT - # PT.CAPTION_TEXT - # PT.HORZ_LINE - # PT.VERT_LINE - # PT.NOISE - # PT.COUNT - # ... - if maxlevel == 'region': - region_xywh = xywh_from_points(region.get_Coords().points) - tessapi.SetRectangle(region_xywh['x'], region_xywh['y'], region_xywh['w'], region_xywh['h']) + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + if self.parameter['textequiv_level'] == 'region': + tessapi.SetImage(region_image) tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + #if region.get_primaryScript() not in tessapi.GetLoadedLanguages()... 
+ LOG.debug("Recognizing text in region '%s'", region.id) region_text = tessapi.GetUTF8Text().rstrip("\n\f") region_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too if region.get_TextEquiv(): - log.warning("Region '%s' already contained text results", region.id) + LOG.warning("Region '%s' already contained text results", region.id) region.set_TextEquiv([]) # todo: consider SetParagraphSeparator region.add_TextEquiv(TextEquivType(Unicode=region_text, conf=region_conf)) @@ -174,24 +178,27 @@ def _process_regions(self, regions, maxlevel, tessapi): ## line, word, or glyph level: textlines = region.get_TextLine() if not textlines: - log.warning("Region '%s' contains no text lines", region.id) + LOG.warning("Region '%s' contains no text lines", region.id) else: - self._process_lines(textlines, maxlevel, tessapi) + self._process_lines(tessapi, textlines, region_image, region_xywh) - def _process_lines(self, textlines, maxlevel, tessapi): + def _process_lines(self, tessapi, textlines, region_image, region_xywh): for line in textlines: if self.parameter['overwrite_words']: line.set_Word([]) - log.debug("Recognizing text in line '%s'", line.id) - line_xywh = xywh_from_points(line.get_Coords().points) - # log.debug("xywh: %s", line_xywh) - tessapi.SetRectangle(line_xywh['x'], line_xywh['y'], line_xywh['w'], line_xywh['h']) - tessapi.SetPageSegMode(PSM.SINGLE_LINE) # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models - if maxlevel == 'line': + line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + # todo: Tesseract works better if the line images have a 5px margin everywhere + tessapi.SetImage(line_image) + # RAW_LINE fails with pre-LSTM models, but sometimes better with LSTM models + tessapi.SetPageSegMode(PSM.SINGLE_LINE) + #if line.get_primaryScript() not in tessapi.GetLoadedLanguages()... 
+ LOG.debug("Recognizing text in line '%s'", line.id) + if self.parameter['textequiv_level'] == 'line': line_text = tessapi.GetUTF8Text().rstrip("\n\f") line_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too if line.get_TextEquiv(): - log.warning("Line '%s' already contained text results", line.id) + LOG.warning("Line '%s' already contained text results", line.id) line.set_TextEquiv([]) # todo: consider BlankBeforeWord, SetLineSeparator line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf)) @@ -200,61 +207,75 @@ def _process_lines(self, textlines, maxlevel, tessapi): words = line.get_Word() if words: ## external word layout: - log.warning("Line '%s' contains words already, recognition might be suboptimal", line.id) - self._process_existing_words(words, maxlevel, tessapi) + LOG.warning("Line '%s' contains words already, recognition might be suboptimal", line.id) + self._process_existing_words(tessapi, words, line_image, line_xywh) else: ## internal word and glyph layout: tessapi.Recognize() - self._process_words_in_line(line, maxlevel, tessapi.GetIterator()) + self._process_words_in_line(tessapi.GetIterator(), line, line_xywh) - def _process_words_in_line(self, line, maxlevel, result_it): - for word_no in range(0, MAX_ELEMENTS): # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD) - if not result_it: - log.error("No iterator at '%s'", line.id) - break - if result_it.Empty(RIL.WORD): - log.warning("No word in line '%s'", line.id) - break + def _process_words_in_line(self, result_it, line, line_xywh): + if not result_it or result_it.Empty(RIL.WORD): + LOG.warning("No text in line '%s'", line.id) + return + # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD): + word_no = 0 + while result_it and not result_it.Empty(RIL.WORD): word_id = '%s_word%04d' % (line.id, word_no) - log.debug("Recognizing text in word '%s'", word_id) - word_bbox = result_it.BoundingBox(RIL.WORD) - word = WordType(id=word_id, 
Coords=CoordsType(points_from_x0y0x1y1(word_bbox))) + LOG.debug("Decoding text in word '%s'", word_id) + bbox = result_it.BoundingBox(RIL.WORD) + points = points_from_x0y0x1y1(bbox) + # add offset from image: + xywh = xywh_from_points(points) + xywh['x'] += line_xywh['x'] + xywh['y'] += line_xywh['y'] + points = points_from_xywh(xywh) + word = WordType(id=word_id, Coords=CoordsType(points)) line.add_Word(word) # todo: determine if font attributes available for word level will work with LSTM models word_attributes = result_it.WordFontAttributes() if word_attributes: - word_style = TextStyleType(fontSize=word_attributes['pointsize'] if 'pointsize' in word_attributes else None, - fontFamily=word_attributes['font_name'] if 'font_name' in word_attributes else None, - bold=None if 'bold' not in word_attributes else word_attributes['bold'], - italic=None if 'italic' not in word_attributes else word_attributes['italic'], - underlined=None if 'underlined' not in word_attributes else word_attributes['underlined'], - monospace=None if 'monospace' not in word_attributes else word_attributes['monospace'], - serif=None if 'serif' not in word_attributes else word_attributes['serif'] - ) + word_style = TextStyleType( + fontSize=word_attributes['pointsize'] + if 'pointsize' in word_attributes else None, + fontFamily=word_attributes['font_name'] + if 'font_name' in word_attributes else None, + bold=word_attributes['bold'] + if 'bold' in word_attributes else None, + italic=word_attributes['italic'] + if 'italic' in word_attributes else None, + underlined=word_attributes['underlined'] + if 'underlined' in word_attributes else None, + monospace=word_attributes['monospace'] + if 'monospace' in word_attributes else None, + serif=word_attributes['serif'] + if 'serif' in word_attributes else None) word.set_TextStyle(word_style) # (or somewhere in custom attribute?) # add word annotation unconditionally (i.e. 
even for glyph level): - word.add_TextEquiv(TextEquivType(Unicode=result_it.GetUTF8Text(RIL.WORD), conf=result_it.Confidence(RIL.WORD)/100)) - if maxlevel == 'word': - pass - else: - self._process_glyphs_in_word(word, result_it) + word.add_TextEquiv(TextEquivType( + Unicode=result_it.GetUTF8Text(RIL.WORD), + conf=result_it.Confidence(RIL.WORD)/100)) + if self.parameter['textequiv_level'] != 'word': + self._process_glyphs_in_word(result_it, word, xywh) if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD): break else: + word_no += 1 result_it.Next(RIL.WORD) - def _process_existing_words(self, words, maxlevel, tessapi): + def _process_existing_words(self, tessapi, words, line_image, line_xywh): for word in words: - log.debug("Recognizing text in word '%s'", word.id) - word_xywh = xywh_from_points(word.get_Coords().points) - tessapi.SetRectangle(word_xywh['x'], word_xywh['y'], word_xywh['w'], word_xywh['h']) + word_image, word_xywh = image_from_word( + self.workspace, word, line_image, line_xywh) + tessapi.SetImage(word_image) tessapi.SetPageSegMode(PSM.SINGLE_WORD) - if maxlevel == 'word': + if self.parameter['textequiv_level'] == 'word': + LOG.debug("Recognizing text in word '%s'", word.id) word_text = tessapi.GetUTF8Text().rstrip("\n\f") word_conf = tessapi.AllWordConfidences() word_conf = word_conf[0]/100.0 if word_conf else 0.0 if word.get_TextEquiv(): - log.warning("Word '%s' already contained text results", word.id) + LOG.warning("Word '%s' already contained text results", word.id) word.set_TextEquiv([]) # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle()) word.add_TextEquiv(TextEquivType(Unicode=word_text, conf=word_conf)) @@ -263,62 +284,68 @@ def _process_existing_words(self, words, maxlevel, tessapi): glyphs = word.get_Glyph() if glyphs: ## external glyph layout: - log.warning("Word '%s' contains glyphs already, recognition might be suboptimal", word.id) - self._process_existing_glyphs(glyphs, tessapi) + LOG.warning("Word '%s' 
contains glyphs already, recognition might be suboptimal", word.id) + self._process_existing_glyphs(tessapi, glyphs, word_image, word_xywh) else: ## internal glyph layout: tessapi.Recognize() - self._process_glyphs_in_word(word, tessapi.GetIterator()) + self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh) - def _process_existing_glyphs(self, glyphs, tessapi): + def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): for glyph in glyphs: - log.debug("Recognizing glyph in word '%s'", glyph.id) - glyph_xywh = xywh_from_points(glyph.get_Coords().points) - tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'], glyph_xywh['w'], glyph_xywh['h']) + glyph_image, glyph_xywh = image_from_glyph( + self.workspace, glyph, word_image, word_xywh) + tessapi.SetImage(glyph_image) tessapi.SetPageSegMode(PSM.SINGLE_CHAR) + LOG.debug("Recognizing text in glyph '%s'", glyph.id) if glyph.get_TextEquiv(): - log.warning("Glyph '%s' already contained text results", glyph.id) + LOG.warning("Glyph '%s' already contained text results", glyph.id) glyph.set_TextEquiv([]) #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f") glyph_conf = tessapi.AllWordConfidences() glyph_conf = glyph_conf[0]/100.0 if glyph_conf else 0.0 - #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) + #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) result_it = tessapi.GetIterator() if not result_it or result_it.Empty(RIL.SYMBOL): - log.error("No glyph here") + LOG.error("No text in glyph '%s'", glyph.id) continue choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = choice.Confidence()/100 - #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break # todo: consider 
SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf)) - def _process_glyphs_in_word(self, word, result_it): - for glyph_no in range(0, MAX_ELEMENTS): # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL) - if not result_it: - log.error("No iterator at '%s'", word.id) - break - if result_it.Empty(RIL.SYMBOL): - log.debug("No glyph here") - break + def _process_glyphs_in_word(self, result_it, word, word_xywh): + if not result_it or result_it.Empty(RIL.SYMBOL): + LOG.debug("No glyph in word '%s'", word.id) + return + # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL): + glyph_no = 0 + while result_it and not result_it.Empty(RIL.SYMBOL): glyph_id = '%s_glyph%04d' % (word.id, glyph_no) - log.debug("Recognizing text in glyph '%s'", glyph_id) + LOG.debug("Decoding text in glyph '%s'", glyph_id) # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice? glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice? 
- #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) - glyph_bbox = result_it.BoundingBox(RIL.SYMBOL) - glyph = GlyphType(id=glyph_id, Coords=CoordsType(points_from_x0y0x1y1(glyph_bbox))) + #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) + bbox = result_it.BoundingBox(RIL.SYMBOL) + points = points_from_x0y0x1y1(bbox) + # add offset from image: + xywh = xywh_from_points(points) + xywh['x'] += word_xywh['x'] + xywh['y'] += word_xywh['y'] + points = points_from_xywh(xywh) + glyph = GlyphType(id=glyph_id, Coords=CoordsType(points)) word.add_Glyph(glyph) choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = choice.Confidence()/100 - #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break @@ -327,6 +354,7 @@ def _process_glyphs_in_word(self, word, result_it): if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL): break else: + glyph_no += 1 result_it.Next(RIL.SYMBOL) def page_update_higher_textequiv_levels(level, pcgts): diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 894c5c7..f2955f1 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -1,53 +1,109 @@ from __future__ import absolute_import -from tesserocr import PyTessBaseAPI, RIL + +import os.path +from tesserocr import PyTessBaseAPI, RIL, PSM + from ocrd import Processor -from ocrd_utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + points_from_xywh, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, + LabelType, LabelsType, + MetadataItemType, TextLineType, - to_xml 
) -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region +) -log = getLogger('processor.TesserocrSegmentLine') +TOOL = 'ocrd-tesserocr-segment-line' +LOG = getLogger('processor.TesserocrSegmentLine') class TesserocrSegmentLine(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-line'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentLine, self).__init__(*args, **kwargs) def process(self): + """Performs (text) line segmentation with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the region level, + and remove any existing TextLine elements (unless `overwrite_lines` + is False). + + Set up Tesseract to detect lines, and add each one to the region + at the detected coordinates. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the line segmentation. 
- """ - with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + overwrite_lines = self.parameter['overwrite_lines'] + + with PyTessBaseAPI( + psm=PSM.SINGLE_BLOCK, + path=TESSDATA_PREFIX + ) as tessapi: for (n, input_file) in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image_url = pcgts.get_Page().imageFilename - for region in pcgts.get_Page().get_TextRegion(): - log.debug("Detecting lines in %s with tesseract", region.id) - image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(region.get_Coords().points)) - tessapi.SetImage(image) - offset = xywh_from_points(region.get_Coords().points) - for (line_no, component) in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True)): + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + + for region in page.get_TextRegion(): + if region.get_TextLine(): + if overwrite_lines: + LOG.info('removing existing TextLines in region "%s"', region.id) + region.set_TextLine([]) + else: + LOG.warning('keeping existing TextLines in region "%s"', region.id) + LOG.debug("Detecting lines in region '%s'", region.id) + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + tessapi.SetImage(region_image) + for line_no, component 
in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) line_xywh = component[1] - line_xywh['x'] += offset['x'] - line_xywh['y'] += offset['y'] + line_xywh['x'] += region_xywh['x'] + line_xywh['y'] += region_xywh['y'] line_points = points_from_xywh(line_xywh) - region.add_TextLine(TextLineType(id=line_id, Coords=CoordsType(line_points))) - ID = concat_padded(self.output_file_grp, n) + region.add_TextLine(TextLineType( + id=line_id, Coords=CoordsType(line_points))) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index c8df11c..ff14e89 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -1,9 +1,21 @@ from __future__ import absolute_import -import tesserocr -from ocrd_utils import getLogger, concat_padded, points_from_x0y0x1y1, xywh_from_points, MIMETYPE_PAGE + +import os.path +from tesserocr import ( + PyTessBaseAPI, + PSM, RIL, PT +) + +from ocrd_utils import ( + getLogger, concat_padded, + points_from_x0y0x1y1, + points_from_xywh, xywh_from_points, + MIMETYPE_PAGE) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - CoordsType, + MetadataItemType, + LabelsType, LabelType, + CoordsType, AlternativeImageType, OrderedGroupType, ReadingOrderType, RegionRefIndexedType, @@ -12,90 +24,242 @@ MathsRegionType, SeparatorRegionType, 
NoiseRegionType, + to_xml) +from ocrd_models.ocrd_page_generateds import TableRegionType +from ocrd import Processor - to_xml +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + save_image_file, + membername ) -from ocrd import Processor -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +TOOL = 'ocrd-tesserocr-segment-region' +LOG = getLogger('processor.TesserocrSegmentRegion') +FILEGRP_IMG = 'OCR-D-IMG-CROP' -log = getLogger('processor.TesserocrSegmentRegion') +PADDING = 8 # extend detected region rectangles by how many (true) pixels? +# (will be passed as padding to both BoundingBox and GetImage) +# (actually, Tesseract honours padding only on the left and bottom, +# whereas right and top are increased less) class TesserocrSegmentRegion(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-region'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentRegion, self).__init__(*args, **kwargs) def process(self): + """Performs (text) region segmentation with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images, + and remove any existing Region and ReadingOrder elements + (unless `overwrite_regions` is False). + + Set up Tesseract to detect blocks, and add each one to the page + as a region according to BlockType at the detected coordinates. + If `find_tables` is True, try to detect table blocks and add them + as (atomic) TableRegion. + + If `crop_polygons` is True, create a cropped (and possibly deskewed) + raw image file for each region (masked along its polygon outline), + and reference it as AlternativeImage in the region element and + as file with a fileGrp USE equal `OCR-D-IMG-CROP` in the workspace. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the region segmentation. 
- """ - with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: - # print(self.input_file_grp) + overwrite_regions = self.parameter['overwrite_regions'] + find_tables = self.parameter['find_tables'] + + with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + if not find_tables: + # disable table detection here, so tables will be + # analysed as independent text/line blocks: + tessapi.SetVariable("textord_tabfind_find_tables", "0") for (n, input_file) in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - log.debug("Detecting regions with tesseract") - tessapi.SetImage(image) - # respect border element if present - if pcgts.get_Page().get_Border() is not None and pcgts.get_Page().get_Border().get_Coords() is not None: - border = xywh_from_points(pcgts.get_Page().get_Border().get_Coords().points) - log.debug("Explictly set page border at %s", pcgts.get_Page().get_Border().get_Coords().points) - tessapi.SetRectangle(border['x'], border['y'], border['w'], border['h']) - - # recognize the layout and the region types - it = tessapi.AnalyseLayout() - index = 0 - while it and not it.Empty(tesserocr.RIL.BLOCK): - points = points_from_x0y0x1y1(it.BoundingBox(tesserocr.RIL.BLOCK)) - - # - # the region reference in the reading order element - # - ID = "region%04d" % index - log.debug("Detected region '%s': %s", ID, points) - # - ro = pcgts.get_Page().get_ReadingOrder() - if ro is None: - ro = ReadingOrderType() - pcgts.get_Page().set_ReadingOrder(ro) - # - og = ro.get_OrderedGroup() - if og is None: - og = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(og) - # - og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index)) - - # - # region switch - # - block_type = 
it.BlockType() - if block_type in [tesserocr.PT.FLOWING_TEXT, tesserocr.PT.HEADING_TEXT, tesserocr.PT.PULLOUT_TEXT]: - pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.FLOWING_IMAGE, tesserocr.PT.HEADING_IMAGE, tesserocr.PT.PULLOUT_IMAGE]: - pcgts.get_Page().add_ImageRegion(ImageRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.HORZ_LINE, tesserocr.PT.VERT_LINE]: - pcgts.get_Page().add_SeparatorRegion(SeparatorRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.INLINE_EQUATION, tesserocr.PT.EQUATION]: - pcgts.get_Page().add_MathsRegion(MathsRegionType(id=ID, Coords=CoordsType(points=points))) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + if page.get_TextRegion(): + if overwrite_regions: + LOG.info('removing existing TextRegions') + page.set_TextRegion([]) else: - pcgts.get_Page().add_NoiseRegion(NoiseRegionType(id=ID, Coords=CoordsType(points=points))) - - # - # iterator increment - # - index += 1 - it.Next(tesserocr.RIL.BLOCK) + LOG.warning('keeping existing TextRegions') + # todo: also make non-text regions protected? 
+ page.set_AdvertRegion([]) + page.set_ChartRegion([]) + page.set_ChemRegion([]) + page.set_GraphicRegion([]) + page.set_ImageRegion([]) + page.set_LineDrawingRegion([]) + page.set_MathsRegion([]) + page.set_MusicRegion([]) + page.set_NoiseRegion([]) + page.set_SeparatorRegion([]) + page.set_TableRegion([]) + page.set_UnknownRegion([]) + if page.get_ReadingOrder(): + if overwrite_regions: + LOG.info('overwriting existing ReadingOrder') + # (cannot sustain old regionrefs) + page.set_ReadingOrder([]) + else: + LOG.warning('keeping existing ReadingOrder') + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + LOG.info("Detecting regions in page '%s'", page_id) + tessapi.SetImage(page_image) # is already cropped to Border + tessapi.SetPageSegMode(PSM.AUTO) # (default) - ID = concat_padded(self.output_file_grp, n) + # detect the region segments and types: + layout = tessapi.AnalyseLayout() + self._process_page(layout, page, page_image, page_xywh, input_file.pageId, file_id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): + # equivalent to GetComponentImages with raw_image=True, + # (which would also give raw coordinates), + # except we are also interested in the iterator's BlockType() here, + # and its BlockPolygon() + index = 0 + while it and not 
it.Empty(RIL.BLOCK): + bbox = it.BoundingBox(RIL.BLOCK, padding=PADDING) + points = points_from_x0y0x1y1(bbox) + # add offset from any Border: + xywh = xywh_from_points(points) + xywh['x'] += page_xywh['x'] + xywh['y'] += page_xywh['y'] + points = points_from_xywh(xywh) + # this crashes due to tesserocr issue #184 (fixed in PR #185); + # also, sometimes these polygons are not planar (probably a + # bug in Tesseract itself): + # TODO: uncomment as soon as a merged tesserocr release is out: + # polygon = it.BlockPolygon() + # if self.parameter['crop_polygons'] and polygon and list(polygon): + # # add offset from any Border, and + # # avoid negative results (invalid in PAGE): + # polygon = [(max(0, x + page_xywh['x']), + # max(0, y + page_xywh['y'])) + # for x, y in polygon] + # points = points_from_polygon(polygon) + coords = CoordsType(points=points) + # if xywh['w'] < 30 or xywh['h'] < 30: + # LOG.info('Ignoring too small region: %s', points) + # it.Next(RIL.BLOCK) + # continue + # region_image_bin = it.GetBinaryImage(RIL.BLOCK) + # if not region_image_bin.getbbox(): + # LOG.info('Ignoring binary-empty region: %s', points) + # it.Next(RIL.BLOCK) + # continue + # + # the region reference in the reading order element + # + ID = "region%04d" % index + ro = page.get_ReadingOrder() + if not ro: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + og = ro.get_OrderedGroup() + if not og: + og = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(og) + og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index)) + # + # region type switch + # + block_type = it.BlockType() + if block_type in [PT.FLOWING_TEXT, + PT.HEADING_TEXT, + PT.PULLOUT_TEXT, + PT.CAPTION_TEXT, + # TABLE is contained in PTIsTextType, but + # it is a bad idea to create a TextRegion + # for it (better set `find_tables` False): + # PT.TABLE, + # will always yield a 90° deskew angle below: + PT.VERTICAL_TEXT]: + region = TextRegionType(id=ID, Coords=coords) + 
page.add_TextRegion(region) + elif block_type in [PT.FLOWING_IMAGE, + PT.HEADING_IMAGE, + PT.PULLOUT_IMAGE]: + region = ImageRegionType(id=ID, Coords=coords) + page.add_ImageRegion(region) + elif block_type in [PT.HORZ_LINE, + PT.VERT_LINE]: + region = SeparatorRegionType(id=ID, Coords=coords) + page.add_SeparatorRegion(region) + elif block_type in [PT.INLINE_EQUATION, + PT.EQUATION]: + region = MathsRegionType(id=ID, Coords=coords) + page.add_MathsRegion(region) + elif block_type == PT.TABLE: + # without API access to StructuredTable we cannot + # do much for a TableRegionType (i.e. nrows, ncols, + # coordinates of cells for recursive regions etc), + # but this could be achieved later by a specialised + # processor + region = TableRegionType(id=ID, Coords=coords) + page.add_TableRegion(region) + else: + region = NoiseRegionType(id=ID, Coords=coords) + page.add_NoiseRegion() + LOG.info("Detected region '%s': %s (%s)", ID, points, membername(PT, block_type)) + if self.parameter['crop_polygons']: + # Store the cropped (and deskewed) image for the region, + # this is not always preferable, because Tesseract tends + # to produce polygon outlines that are worse than the + # enclosing bounding boxes, and these are always used + # as mask for the image (see above). Also, it chops off + # corners when rotating against the recognised skew. + # Moreover, the mix of colour and white background + # in these images might cause binarization trouble. + # (Although against the latter we could switch to + # GetBinaryImage). + # You have been warned! 
+ # get the raw image (masked by white space along the block polygon): + region_image, top, left = it.GetImage(RIL.BLOCK, PADDING, page_image) + # update METS (add the image file): + file_path = save_image_file(self.workspace, region_image, + file_id + '_' + ID, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + region.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments="cropped")) + # + # iterator increment + # + index += 1 + it.Next(RIL.BLOCK) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 8b8c8d7..a74a168 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -1,54 +1,112 @@ from __future__ import absolute_import + +import os.path from tesserocr import RIL, PyTessBaseAPI, PSM + from ocrd import Processor -from ocrd_utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + points_from_xywh, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, + LabelType, LabelsType, + MetadataItemType, WordType, to_xml ) from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line +) -log = getLogger('processor.TesserocrSegmentWord') +TOOL = 'ocrd-tesserocr-segment-word' +LOG = getLogger('processor.TesserocrSegmentWord') class TesserocrSegmentWord(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-word'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentWord, self).__init__(*args, **kwargs) def process(self): + """Performs word segmentation with Tesseract on the workspace. 
+ + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the textline level, + and remove any existing Word elements (unless `overwrite_words` + is False). + + Set up Tesseract to detect words, and add each one to the line + at the detected coordinates. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the line segmentation. - """ + overwrite_words = self.parameter['overwrite_words'] + with PyTessBaseAPI( psm=PSM.SINGLE_LINE, - path=TESSDATA_PREFIX, + path=TESSDATA_PREFIX ) as tessapi: for (n, input_file) in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image_url = pcgts.get_Page().imageFilename - for region in pcgts.get_Page().get_TextRegion(): + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + + for region in page.get_TextRegion(): + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) for line in region.get_TextLine(): - log.debug("Detecting words in line '%s'", line.id) - image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points)) - tessapi.SetImage(image) - offset = xywh_from_points(line.get_Coords().points) - for (word_no, 
component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)): + if line.get_Word(): + if overwrite_words: + LOG.info('removing existing Words in line "%s"', line.id) + line.set_Word([]) + else: + LOG.warning('keeping existing Words in line "%s"', line.id) + LOG.debug("Detecting words in line '%s'", line.id) + line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + tessapi.SetImage(line_image) + for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)): word_id = '%s_word%04d' % (line.id, word_no) word_xywh = component[1] - word_xywh['x'] += offset['x'] - word_xywh['y'] += offset['y'] - line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh)))) - ID = concat_padded(self.output_file_grp, n) + word_xywh['x'] += line_xywh['x'] + word_xywh['y'] += line_xywh['y'] + word_points = points_from_xywh(word_xywh) + line.add_Word(WordType( + id=word_id, Coords=CoordsType(word_points))) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, - local_filename='%s/%s' % (self.output_file_grp, ID), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/setup.py b/setup.py index 2acf4bf..4a116f4 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ - ocrd_tesserocr_segment_word - ocrd_tesserocr_crop - ocrd_tesserocr_deskew + - ocrd_tesserocr_binarize """ import codecs @@ -15,7 +16,7 @@ setup( name='ocrd_tesserocr', - version='0.2.2', + version='0.2.3', description='Tesserocr bindings', long_description=codecs.open('README.rst', encoding='utf-8').read(), 
author='Konstantin Baierer, Kay-Michael Würzner', @@ -35,6 +36,7 @@ 'ocrd-tesserocr-segment-word=ocrd_tesserocr.cli:ocrd_tesserocr_segment_word', 'ocrd-tesserocr-crop=ocrd_tesserocr.cli:ocrd_tesserocr_crop', 'ocrd-tesserocr-deskew=ocrd_tesserocr.cli:ocrd_tesserocr_deskew', + 'ocrd-tesserocr-binarize=ocrd_tesserocr.cli:ocrd_tesserocr_binarize', ] }, ) From 3a4b8e6399f45430080b2676a1686f828c548236 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Wed, 3 Jul 2019 12:08:45 +0200 Subject: [PATCH 3/6] change new version --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4a116f4..5048ff3 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ setup( name='ocrd_tesserocr', - version='0.2.3', + version='0.3.0', description='Tesserocr bindings', long_description=codecs.open('README.rst', encoding='utf-8').read(), - author='Konstantin Baierer, Kay-Michael Würzner', - author_email='unixprog@gmail.com, wuerzner@gmail.com', + author='Konstantin Baierer, Kay-Michael Würzner, Robert Sachunsky', + author_email='unixprog@gmail.com, wuerzner@gmail.com, sachunsky@informatik.uni-leipzig.de', url='https://github.com/OCR-D/ocrd_tesserocr', license='Apache License 2.0', packages=find_packages(exclude=('tests', 'docs')), From 6eaba4a71dc476c87d61c5bd69ff0127ab8a4a19 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Wed, 3 Jul 2019 12:09:39 +0200 Subject: [PATCH 4/6] remove resegmentation from common --- ocrd_tesserocr/common.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ocrd_tesserocr/common.py b/ocrd_tesserocr/common.py index 57b391b..6aa4c1a 100644 --- a/ocrd_tesserocr/common.py +++ b/ocrd_tesserocr/common.py @@ -11,10 +11,6 @@ LOG = getLogger('') # to be refined by importer -# dummy (not available without ocrolib) -def resegment(mask_image, labels): - return mask_image - # to be refactored into core (as function in ocrd_utils): def polygon_mask(image, coordinates): mask = Image.new('L', image.size, 0) @@ 
-207,10 +203,6 @@ def image_from_line(workspace, line, orig={'x': 0.5 * region_image.width, 'y': 0.5 * region_image.height}) line_mask = polygon_mask(region_image, line_polygon) - if isinstance(segmentation, np.ndarray): - # modify mask from (ad-hoc) line segmentation of region - # (shrink to largest label spread in that area): - line_mask = resegment(line_mask, segmentation) # create a background image from its median color # (in case it has not been binarized yet): region_array = np.asarray(region_image) From 8f6b94d22e86ecd206c52b2ca6e981ef5c69dd9f Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Wed, 3 Jul 2019 12:09:52 +0200 Subject: [PATCH 5/6] expose parameters, improve docstrings --- .pylintrc | 1 - ocrd_tesserocr/binarize.py | 4 ++-- ocrd_tesserocr/crop.py | 16 ++++++++-------- ocrd_tesserocr/deskew.py | 16 ++++++++++++---- ocrd_tesserocr/ocrd-tool.json | 19 ++++++++++++++++--- ocrd_tesserocr/recognize.py | 2 +- ocrd_tesserocr/segment_line.py | 2 +- ocrd_tesserocr/segment_region.py | 7 +++---- ocrd_tesserocr/segment_word.py | 2 +- 9 files changed, 44 insertions(+), 25 deletions(-) diff --git a/.pylintrc b/.pylintrc index dda4d9e..710b8b2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -5,7 +5,6 @@ ignored-modules=cv2,tesserocr [MESSAGES CONTROL] disable = ungrouped-imports, -# fixme, bad-continuation, missing-docstring, no-self-use, diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 954279f..494c0bc 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -43,10 +43,10 @@ def __init__(self, *args, **kwargs): def process(self): """Performs binarization with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the requested level. 
- Set up Tesseract to recognise the segment image's layout, and get + Set up Tesseract to recognize the segment image's layout, and get the binarized image. Create an image file, and reference it as AlternativeImage in the element and as file with a fileGrp USE equal `OCR-D-IMG-BIN` in the workspace. diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index fca0a55..3edee91 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -25,8 +25,6 @@ TOOL = 'ocrd-tesserocr-crop' LOG = getLogger('processor.TesserocrCrop') -PADDING = 4 # extend detected border by how many (true) pixels on every side? - class TesserocrCrop(Processor): def __init__(self, *args, **kwargs): @@ -35,15 +33,17 @@ def __init__(self, *args, **kwargs): super(TesserocrCrop, self).__init__(*args, **kwargs) def process(self): - """Performs crude page cropping with Tesseract on the workspace. + """Performs page cropping with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images. + Open and deserialize PAGE input files and their respective images. Set up Tesseract to detect text blocks on each page, and find the largest coordinate extent spanning all of them. Use this extent in defining a Border, and add that to the page. Produce new output files by serialising the resulting hierarchy. 
""" + padding = self.parameter['padding'] + with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: # disable table detection here (tables count as text blocks), # because we do not want to risk confusing the spine with @@ -131,10 +131,10 @@ def process(self): # if min_x < max_x and min_y < max_y: # add padding: - min_x = max(min_x - PADDING, 0) - max_x = min(max_x + PADDING, page_image.width) - min_y = max(min_y - PADDING, 0) - max_y = min(max_y + PADDING, page_image.height) + min_x = max(min_x - padding, 0) + max_x = min(max_x + padding, page_image.width) + min_y = max(min_y - padding, 0) + max_y = min(max_y + padding, page_image.height) LOG.debug("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) border = BorderType(Coords=CoordsType( points_from_bbox(min_x, min_y, max_x, max_y))) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 89be7b6..9be2335 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -44,7 +44,7 @@ def __init__(self, *args, **kwargs): super(TesserocrDeskew, self).__init__(*args, **kwargs) def process(self): - """Performs region-level deskewing with Tesseract on the workspace. + """Performs deskewing of the page / region with Tesseract on the workspace. Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the region level @@ -54,9 +54,9 @@ def process(self): and script (with both OSD and AnalyseLayout). Rotate the image accordingly, and annotate the angle, readingDirection and textlineOrder. - Create a cropped (and possibly deskewed) image file, and reference it - as AlternativeImage in the region element and as file with a fileGrp USE - equal `OCR-D-IMG-DESKEW` in the workspace. + Create a corresponding image file, and reference it as AlternativeImage + in the region element and as file with a fileGrp USE `OCR-D-IMG-DESKEW` + in the workspace. Produce a new output file by serialising the resulting hierarchy. 
""" @@ -172,6 +172,14 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i deskew_angle *= - 180 / math.pi if int(deskew_angle): comments += ',deskewed' + # We could rotate the image by transposition (which is more accurate + # than the general method below), but then the coordinates – + # which are still relative to `imageFilename` – of all the elements + # contained in this segment (i.e. any TextLine if `segment` is TextRegion, + # and any TextRegion if `segment` is Page) will have to be _transposed_ + # (instead of rotated) as well. But PAGE consumers have little chance + # of knowing which method producers chose, so here we generally decide + # to drop this mechanism: # if angle: # image = image.transpose({ # 90: Image.ROTATE_90, diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 72418f1..31e756a 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -5,7 +5,7 @@ "tools": { "ocrd-tesserocr-deskew": { "executable": "ocrd-tesserocr-deskew", - "categories": ["Deskewing"], + "categories": ["Image preprocessing"], "description": "Deskew pages or regions", "input_file_grp": [ "OCR-D-IMG", @@ -73,6 +73,12 @@ "default": true, "description": "remove existing layout and text annotation below the Page level" }, + "padding": { + "type": "number", + "format": "integer", + "description": "extend detected region rectangles by this many (true) pixels", + "default": 8 + }, "crop_polygons": { "type": "boolean", "default": false, @@ -134,11 +140,18 @@ "OCR-D-IMG-CROPPED" ], "steps": ["preprocessing/optimization/cropping"], - "parameters" : {} + "parameters" : { + "padding": { + "type": "number", + "format": "integer", + "description": "extend detected border by this many (true) pixels on every side", + "default": 4 + } + } }, "ocrd-tesserocr-binarize": { "executable": "ocrd-tesserocr-binarize", - "categories": ["Binarization"], + "categories": ["Image preprocessing"], "description": "Binarize 
pages, regions or lines", "input_file_grp": [ "OCR-D-IMG", diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 959200e..b7699d3 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -345,7 +345,7 @@ def _process_glyphs_in_word(self, result_it, word, word_xywh): for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = choice.Confidence()/100 - #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + LOG.trace('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index f2955f1..4890237 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -38,7 +38,7 @@ def __init__(self, *args, **kwargs): def process(self): """Performs (text) line segmentation with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level, and remove any existing TextLine elements (unless `overwrite_lines` is False). diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index ff14e89..46059ab 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -39,7 +39,6 @@ LOG = getLogger('processor.TesserocrSegmentRegion') FILEGRP_IMG = 'OCR-D-IMG-CROP' -PADDING = 8 # extend detected region rectangles by how many (true) pixels? 
# (will be passed as padding to both BoundingBox and GetImage) # (actually, Tesseract honours padding only on the left and bottom, # whereas right and top are increased less) @@ -54,7 +53,7 @@ def __init__(self, *args, **kwargs): def process(self): """Performs (text) region segmentation with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements (unless `overwrite_regions` is False). @@ -153,7 +152,7 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): # and its BlockPolygon() index = 0 while it and not it.Empty(RIL.BLOCK): - bbox = it.BoundingBox(RIL.BLOCK, padding=PADDING) + bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding']) points = points_from_x0y0x1y1(bbox) # add offset from any Border: xywh = xywh_from_points(points) @@ -249,7 +248,7 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): # GetBinaryImage). # You have been warned! # get the raw image (masked by white space along the block polygon): - region_image, top, left = it.GetImage(RIL.BLOCK, PADDING, page_image) + region_image, top, left = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image) # update METS (add the image file): file_path = save_image_file(self.workspace, region_image, file_id + '_' + ID, diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index a74a168..38b0551 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -38,7 +38,7 @@ def __init__(self, *args, **kwargs): def process(self): """Performs word segmentation with Tesseract on the workspace. 
- Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the textline level, and remove any existing Word elements (unless `overwrite_words` is False). From 8e3b953b65b40bc5112ac97f25c51c44de6276ba Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Wed, 3 Jul 2019 12:21:18 +0200 Subject: [PATCH 6/6] no trace logging yet --- ocrd_tesserocr/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index b7699d3..959200e 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -345,7 +345,7 @@ def _process_glyphs_in_word(self, result_it, word, word_xywh): for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = choice.Confidence()/100 - LOG.trace('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break