diff --git a/.pylintrc b/.pylintrc index 67d8439..710b8b2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -5,13 +5,22 @@ ignored-modules=cv2,tesserocr [MESSAGES CONTROL] disable = ungrouped-imports, - fixme, bad-continuation, missing-docstring, no-self-use, - too-many-arguments, superfluous-parens, invalid-name, line-too-long, + too-many-arguments, + too-many-branches, + too-many-statements, too-many-locals, too-few-public-methods, + wrong-import-order, + duplicate-code + +# allow indented whitespace (as required by interpreter): +no-space-check=empty-line + +# allow non-snake-case identifiers: +good-names=n,i diff --git a/CHANGELOG.md b/CHANGELOG.md index c42b210..c853e87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.2.3] - 2019-06-28 + +Changed: + * Use basename of input file for output name + * Use .xml filename extension for PAGE output + * Warn about existing border or regions in `crop` + * Use `PSM.SPARSE_TEXT` without tables in `crop` + * Filter unreliable regions in `crop` + * Add padding around border in `crop` + * Delete existing regions in `segment_region` + * Cover vertical text and tables in `segment_region` + * Add parameter `find_tables` in `segment_region` + * Add parameter `crop_polygons` in `segment_region` + * Add parameter `overwrite_regions` in `segment_region` + * Add parameter `overwrite_lines` in `segment_line` + * Add parameter `overwrite_words` in `segment_word` + * Add page/region-level processor `deskew` + * Add page/region/line-level processor `binarize` + * Respect AlternativeImage on all levels + ## [0.2.2] - 2019-05-20 Changed: diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py index 20d4b86..c052054 100644 --- a/ocrd_tesserocr/__init__.py +++ b/ocrd_tesserocr/__init__.py @@ -11,3 +11,5 @@ from .segment_line import TesserocrSegmentLine from .segment_region import TesserocrSegmentRegion from .crop import TesserocrCrop 
+from .deskew import TesserocrDeskew +from .binarize import TesserocrBinarize diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py new file mode 100644 index 0000000..494c0bc --- /dev/null +++ b/ocrd_tesserocr/binarize.py @@ -0,0 +1,139 @@ +from __future__ import absolute_import + +import os.path +from tesserocr import ( + PyTessBaseAPI, + PSM, RIL +) + +from ocrd_utils import ( + getLogger, concat_padded, + MIMETYPE_PAGE +) +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import ( + MetadataItemType, + LabelsType, LabelType, + AlternativeImageType, + TextRegionType, + to_xml +) +from ocrd import Processor + +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line, + save_image_file, + membername +) + +TOOL = 'ocrd-tesserocr-binarize' +LOG = getLogger('processor.TesserocrBinarize') +FILEGRP_IMG = 'OCR-D-IMG-BIN' + +class TesserocrBinarize(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + kwargs['version'] = OCRD_TOOL['version'] + super(TesserocrBinarize, self).__init__(*args, **kwargs) + + def process(self): + """Performs binarization with Tesseract on the workspace. + + Open and deserialize PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Set up Tesseract to recognize the segment image's layout, and get + the binarized image. Create an image file, and reference it as + AlternativeImage in the element and as file with a fileGrp USE + equal `OCR-D-IMG-BIN` in the workspace. + + Produce a new output file by serialising the resulting hierarchy. 
+ """ + oplevel = self.parameter['operation_level'] + with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + for n, input_file in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) + pcgts = page_from_file(self.workspace.download_file(input_file)) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) + + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + if oplevel == 'page': + tessapi.SetPageSegMode(PSM.AUTO) + self._process_segment(tessapi, RIL.BLOCK, page, page_image, page_xywh, + "page '%s'" % page_id, input_file.pageId, + file_id) + else: + regions = page.get_TextRegion() + page.get_TableRegion() + if not regions: + LOG.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + if oplevel == 'region': + tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh, + "region '%s'" % region.id, input_file.pageId, + file_id + '_' + region.id) + elif isinstance(region, TextRegionType): + lines = region.get_TextLine() + if not lines: + LOG.warning("Page '%s' region '%s' contains no text lines", + page_id, region.id) + for 
line in lines: + line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + tessapi.SetPageSegMode(PSM.SINGLE_LINE) + self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh, + "line '%s'" % line.id, input_file.pageId, + file_id + '_' + region.id + '_' + line.id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, pageId=input_file.pageId, # keep the physical page mapping of the input file in METS + mimetype=MIMETYPE_PAGE, + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id): + tessapi.SetImage(image) + image_bin = None + layout = tessapi.AnalyseLayout() + if layout: + image_bin = layout.GetBinaryImage(ril) + if not image_bin: + LOG.error('Cannot binarize %s', where) + return + # update METS (add the image file): + file_path = save_image_file(self.workspace, image_bin, + file_id, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + segment.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments="binarized")) diff --git a/ocrd_tesserocr/cli.py b/ocrd_tesserocr/cli.py index 9d836c8..eb3f0dc 100644 --- a/ocrd_tesserocr/cli.py +++ b/ocrd_tesserocr/cli.py @@ -6,6 +6,8 @@ from ocrd_tesserocr.segment_line import TesserocrSegmentLine from ocrd_tesserocr.segment_word import TesserocrSegmentWord from ocrd_tesserocr.crop import TesserocrCrop +from ocrd_tesserocr.deskew import TesserocrDeskew +from ocrd_tesserocr.binarize import TesserocrBinarize @click.command() @ocrd_cli_options @@ -31,3 +33,13 @@ def ocrd_tesserocr_recognize(*args, **kwargs): @ocrd_cli_options def ocrd_tesserocr_crop(*args, **kwargs): return 
ocrd_cli_wrap_processor(TesserocrCrop, *args, **kwargs) + +@click.command() +@ocrd_cli_options +def ocrd_tesserocr_deskew(*args, **kwargs): + return ocrd_cli_wrap_processor(TesserocrDeskew, *args, **kwargs) + +@click.command() +@ocrd_cli_options +def ocrd_tesserocr_binarize(*args, **kwargs): + return ocrd_cli_wrap_processor(TesserocrBinarize, *args, **kwargs) diff --git a/ocrd_tesserocr/common.py b/ocrd_tesserocr/common.py new file mode 100644 index 0000000..6aa4c1a --- /dev/null +++ b/ocrd_tesserocr/common.py @@ -0,0 +1,448 @@ +from __future__ import absolute_import + +import os.path +import sys +import io + +import numpy as np +from PIL import Image, ImageDraw + +from ocrd_utils import getLogger, xywh_from_points, polygon_from_points + +LOG = getLogger('') # to be refined by importer + +# to be refactored into core (as function in ocrd_utils): +def polygon_mask(image, coordinates): + mask = Image.new('L', image.size, 0) + ImageDraw.Draw(mask).polygon(coordinates, outline=1, fill=255) + return mask + +# to be refactored into core (as function in ocrd_utils): +def rotate_polygon(coordinates, angle, orig={'x': 0, 'y': 0}): + # if the region image has been rotated, we must also + # rotate the coordinates of the line + # (which relate to the top page image) + # in the same direction but with inverse transformation + # matrix (i.e. 
passive rotation), and + # (since the region was rotated around its center, + # but our coordinates are now relative to the top left) + # by first translating to center of region, then + # rotating around that center, and translating back: + # point := (point - region_center) * region_rotation + region_center + # moreover, since rotation has reshaped/expanded the image, + # the line coordinates must be offset by those additional pixels: + # point := point + 0.5 * (new_region_size - old_region_size) + angle = np.deg2rad(angle) + # active rotation: [[cos, -sin], [sin, cos]] + # passive rotation: [[cos, sin], [-sin, cos]] (inverse) + return [(orig['x'] + + (x - orig['x'])*np.cos(angle) + + (y - orig['y'])*np.sin(angle), + orig['y'] + - (x - orig['x'])*np.sin(angle) + + (y - orig['y'])*np.cos(angle)) + for x, y in coordinates] + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_page(workspace, page, + page_image, + page_id): + """Extract the Page image from the workspace. + + Given a PIL.Image of the page, `page_image`, + and the Page object logically associated with it, `page`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `page_image` (if a Border exists), + or by just returning `page_image` (otherwise). + + When using AlternativeImage, if the resulting page image + is larger than the annotated page, then pass down the page's + box coordinates with an offset of half the width/height difference. + + Return the extracted image, and the page's box coordinates, + relative to the source image (for passing down). 
+ """ + page_xywh = {'x': 0, + 'y': 0, + 'w': page_image.width, + 'h': page_image.height} + # FIXME: remove PrintSpace here as soon as GT abides by the PAGE standard: + border = page.get_Border() or page.get_PrintSpace() + if border and border.get_Coords(): + LOG.debug("Using explictly set page border '%s' for page '%s'", + border.get_Coords().points, page_id) + page_xywh = xywh_from_points(border.get_Coords().points) + + alternative_image = page.get_AlternativeImage() + if alternative_image: + # (e.g. from page-level cropping, binarization, deskewing or despeckling) + # assumes implicit cropping (i.e. page_xywh has been applied already) + LOG.debug("Using AlternativeImage %d (%s) for page '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + page_id) + page_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + elif border: + page_image = page_image.crop( + box=(page_xywh['x'], + page_xywh['y'], + page_xywh['x'] + page_xywh['w'], + page_xywh['y'] + page_xywh['h'])) + # FIXME: mask away all GraphicRegion, SeparatorRegion etc which + # could overlay any text regions + # subtract offset from any increase in binary region size over source: + page_xywh['x'] -= 0.5 * max(0, page_image.width - page_xywh['w']) + page_xywh['y'] -= 0.5 * max(0, page_image.height - page_xywh['h']) + return page_image, page_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_region(workspace, region, + page_image, page_xywh): + """Extract the TextRegion image from a Page image. + + Given a PIL.Image of the page, `page_image`, + and its coordinates relative to the border, `page_xywh`, + and a TextRegion object logically contained in it, `region`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `page_image`. + + When cropping, respect any angle annotated for the region + (from deskewing) by rotating the cropped image, respectively. 
+ Regardless, if the resulting region image is larger than + the annotated region, pass down the region's box coordinates + with an offset of half the width/height difference. + + Return the extracted image, and the region's box coordinates, + relative to the page image (for passing down). + """ + region_xywh = xywh_from_points(region.get_Coords().points) + # region angle: PAGE orientation is defined clockwise, + # whereas PIL/ndimage rotation is in mathematical direction: + region_xywh['angle'] = -(region.get_orientation() or 0) + alternative_image = region.get_AlternativeImage() + if alternative_image: + # (e.g. from region-level cropping, binarization, deskewing or despeckling) + LOG.debug("Using AlternativeImage %d (%s) for region '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + region.id) + region_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + region_image = page_image.crop( + box=(region_xywh['x'] - page_xywh['x'], + region_xywh['y'] - page_xywh['y'], + region_xywh['x'] - page_xywh['x'] + region_xywh['w'], + region_xywh['y'] - page_xywh['y'] + region_xywh['h'])) + # FIXME: mask any overlapping regions (esp. Separator/Noise/Image) + # but we might need overlapping rules: e.g. an ImageRegion which + # properly contains our TextRegion should be completely ignored, but + # an ImageRegion which is properly contained in our TextRegion should + # be completely masked, while partial overlap may be more difficult + # to decide (use polygons?) 
+ if region_xywh['angle']: + LOG.info("About to rotate region '%s' by %.2f°", + region.id, region_xywh['angle']) + region_image = region_image.rotate(region_xywh['angle'], + expand=True, + #resample=Image.BILINEAR, + fillcolor='white') + # subtract offset from any increase in binary region size over source: + region_xywh['x'] -= 0.5 * max(0, region_image.width - region_xywh['w']) + region_xywh['y'] -= 0.5 * max(0, region_image.height - region_xywh['h']) + return region_image, region_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_line(workspace, line, + region_image, region_xywh, + segmentation=None): + """Extract the TextLine image from a TextRegion image. + + Given a PIL.Image of the region, `region_image`, + and its coordinates relative to the page, `region_xywh`, + and a TextLine object logically contained in it, `line`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `region_image`. + + When cropping, respect any angle annotated for the region + (from deskewing) by compensating the line coordinates in + an inverse transformation (translation to center, rotation, + re-translation). Also, mind the difference between annotated + and actual size of the region (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If passed an optional labelling for the region, `segmentation`, + the mask is shrunk further to the largest overlapping line + label, which avoids seeing ascenders from lines below, and + descenders from lines above `line`. (NOTE: `segmentation` is currently not evaluated by the function body.) + + If the resulting line image is larger than the annotated line, + pass down the line's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the line's box coordinates, + relative to the region image (for passing down). 
+ """ + line_points = line.get_Coords().points + line_xywh = xywh_from_points(line_points) + line_polygon = [(x - region_xywh['x'], + y - region_xywh['y']) + for x, y in polygon_from_points(line_points)] + alternative_image = line.get_AlternativeImage() + if alternative_image: + # (e.g. from line-level cropping, deskewing or despeckling) + LOG.debug("Using AlternativeImage %d (%s) for line '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + line.id) + line_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the line polygon: + line_polygon = rotate_polygon(line_polygon, + region_xywh['angle'], + orig={'x': 0.5 * region_image.width, + 'y': 0.5 * region_image.height}) + line_mask = polygon_mask(region_image, line_polygon) + # create a background image from its median color + # (in case it has not been binarized yet): + region_array = np.asarray(region_image) + background = np.median(region_array, axis=[0, 1], keepdims=True) + region_array = np.broadcast_to(background.astype(np.uint8), region_array.shape) + line_image = Image.fromarray(region_array) + line_image.paste(region_image, mask=line_mask) + # recrop into a line: + bbox = line_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (line_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(line_mask.width, left + line_xywh['w']) + else: + left = line_xywh['x'] - region_xywh['x'] + upper = line_xywh['y'] - region_xywh['y'] + right = left + line_xywh['w'] + lower = upper + line_xywh['h'] + line_image = line_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase in binary line size over source: + line_xywh['x'] -= 0.5 * max(0, line_image.width - line_xywh['w']) + line_xywh['y'] -= 0.5 * max(0, line_image.height - line_xywh['h']) + return line_image, line_xywh 
+ +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_word(workspace, word, + line_image, line_xywh): + """Extract the Word image from a TextLine image. + + Given a PIL.Image of the line, `line_image`, + and its coordinates relative to the region, `line_xywh`, + and a Word object logically contained in it, `word`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `line_image`. + + When cropping, mind the difference between annotated + and actual size of the line (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If the resulting word image is larger than the annotated word, + pass down the word's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the word's box coordinates, + relative to the line image (for passing down). + """ + word_points = word.get_Coords().points + word_xywh = xywh_from_points(word_points) + word_polygon = [(x - line_xywh['x'], + y - line_xywh['y']) + for x, y in polygon_from_points(word_points)] + alternative_image = word.get_AlternativeImage() + if alternative_image: + # (e.g. 
from word-level cropping or binarization) + LOG.debug("Using AlternativeImage %d (%s) for word '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + word.id) + word_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the word polygon: + word_mask = polygon_mask(line_image, word_polygon) + # create a background image from its median color + # (in case it has not been binarized yet): + line_array = np.asarray(line_image) + background = np.median(line_array, axis=[0, 1], keepdims=True) + line_array = np.broadcast_to(background.astype(np.uint8), line_array.shape) + word_image = Image.fromarray(line_array) + word_image.paste(line_image, mask=word_mask) + # recrop into a word: + bbox = word_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (word_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(word_mask.width, left + word_xywh['w']) + else: + left = word_xywh['x'] - line_xywh['x'] + upper = word_xywh['y'] - line_xywh['y'] + right = left + word_xywh['w'] + lower = upper + word_xywh['h'] + word_image = word_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase in binary word size over source: + word_xywh['x'] -= 0.5 * max(0, word_image.width - word_xywh['w']) + word_xywh['y'] -= 0.5 * max(0, word_image.height - word_xywh['h']) + return word_image, word_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def image_from_glyph(workspace, glyph, + word_image, word_xywh): + """Extract the Glyph image from a Word image. + + Given a PIL.Image of the word, `word_image`, + and its coordinates relative to the line, `word_xywh`, + and a Glyph object logically contained in it, `glyph`, + extract its PIL.Image from AlternativeImage (if it exists), + or via cropping from `word_image`. 
+ + When cropping, mind the difference between annotated + and actual size of the word (usually from deskewing), by + a respective offset into the image. Cropping uses a polygon + mask (not just the rectangle). + + If the resulting glyph image is larger than the annotated glyph, + pass down the glyph's box coordinates with an offset of half + the width/height difference. + + Return the extracted image, and the glyph's box coordinates, + relative to the word image (for passing down). + """ + glyph_points = glyph.get_Coords().points + glyph_xywh = xywh_from_points(glyph_points) + glyph_polygon = [(x - word_xywh['x'], + y - word_xywh['y']) + for x, y in polygon_from_points(glyph_points)] + alternative_image = glyph.get_AlternativeImage() + if alternative_image: + # (e.g. from glyph-level cropping or binarization) + LOG.debug("Using AlternativeImage %d (%s) for glyph '%s'", + len(alternative_image), alternative_image[-1].get_comments(), + glyph.id) + glyph_image = workspace.resolve_image_as_pil( + alternative_image[-1].get_filename()) + else: + # create a mask from the glyph polygon: + glyph_mask = polygon_mask(word_image, glyph_polygon) + # create a background image from its median color + # (in case it has not been binarized yet): + word_array = np.asarray(word_image) + background = np.median(word_array, axis=[0, 1], keepdims=True) + word_array = np.broadcast_to(background.astype(np.uint8), word_array.shape) + glyph_image = Image.fromarray(word_array) + glyph_image.paste(word_image, mask=glyph_mask) + # recrop into a word: + bbox = glyph_mask.getbbox() + if bbox: + left, upper, right, lower = bbox + # keep upper/lower, regardless of h (no vertical padding) + # pad left/right if target width w is larger: + margin_x = (glyph_xywh['w'] - right + left) // 2 + left = max(0, left - margin_x) + right = min(glyph_mask.width, left + glyph_xywh['w']) + else: + left = glyph_xywh['x'] - word_xywh['x'] + upper = glyph_xywh['y'] - word_xywh['y'] + right = left + glyph_xywh['w'] + 
lower = upper + glyph_xywh['h'] + glyph_image = glyph_image.crop(box=(left, upper, right, lower)) + # subtract offset from any increase in binary word size over source: + glyph_xywh['x'] -= 0.5 * max(0, glyph_image.width - glyph_xywh['w']) + glyph_xywh['y'] -= 0.5 * max(0, glyph_image.height - glyph_xywh['h']) + return glyph_image, glyph_xywh + +# to be refactored into core (as method of ocrd.workspace.Workspace): +def save_image_file(workspace, image, + file_id, + page_id=None, + file_grp='OCR-D-IMG', # or -BIN? + format='PNG', + force=True): + """Store and reference an image as file into the workspace. + + Given a PIL.Image `image`, and an ID `file_id` to use in METS, + store the image under the fileGrp `file_grp` and physical page + `page_id` into the workspace (in a file name based on + the `file_grp`, `file_id` and `format` extension). + + Return the (absolute) path of the created file. + """ + image_bytes = io.BytesIO() + image.save(image_bytes, format=format) + file_path = os.path.join(file_grp, + file_id + '.' 
+ format.lower()) + out = workspace.add_file( + ID=file_id, + file_grp=file_grp, + pageId=page_id, + local_filename=file_path, + mimetype='image/' + format.lower(), + content=image_bytes.getvalue(), + force=force) + LOG.info('created file ID: %s, file_grp: %s, path: %s', + file_id, file_grp, out.local_filename) + return file_path + +# to be refactored into core (as function in ocrd_utils): +def bbox_from_points(points): + """Constructs a numeric list representing a bounding box from polygon coordinates in page representation.""" + xys = [[int(p) for p in pair.split(',')] for pair in points.split(' ')] + minx = sys.maxsize + miny = sys.maxsize + maxx = 0 + maxy = 0 + for xy in xys: + if xy[0] < minx: + minx = xy[0] + if xy[0] > maxx: + maxx = xy[0] + if xy[1] < miny: + miny = xy[1] + if xy[1] > maxy: + maxy = xy[1] + return minx, miny, maxx, maxy + +# to be refactored into core (as function in ocrd_utils): +def points_from_bbox(minx, miny, maxx, maxy): + """Constructs polygon coordinates in page representation from a numeric list representing a bounding box.""" + return "%i,%i %i,%i %i,%i %i,%i" % ( + minx, miny, maxx, miny, maxx, maxy, minx, maxy) + +# to be refactored into core (as function in ocrd_utils): +def xywh_from_bbox(minx, miny, maxx, maxy): + """Converts a bounding box from a numeric list to a numeric dict representation.""" + return { + 'x': minx, + 'y': miny, + 'w': maxx - minx, + 'h': maxy - miny, + } + +# to be refactored into core (as function in ocrd_utils): +def bbox_from_xywh(xywh): + """Converts a bounding box from a numeric dict to a numeric list representation.""" + return ( + xywh['x'], + xywh['y'], + xywh['x'] + xywh['w'], + xywh['y'] + xywh['h'] + ) + +# to be refactored into core (as function in ocrd_utils): +def points_from_polygon(polygon): + """Converts polygon coordinates from a numeric list representation to a page representation.""" + return " ".join("%i,%i" % (x, y) for x, y in polygon) + +def membername(class_, val): + return 
next((k for k, v in class_.__dict__.items() if v == val), str(val)) diff --git a/ocrd_tesserocr/config.py b/ocrd_tesserocr/config.py index 1b81509..528b184 100644 --- a/ocrd_tesserocr/config.py +++ b/ocrd_tesserocr/config.py @@ -2,8 +2,6 @@ import json from pkg_resources import resource_string -import locale -locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3) import tesserocr TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX'] if 'TESSDATA_PREFIX' in os.environ else tesserocr.get_languages()[0] diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 4a8f118..3edee91 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -1,83 +1,157 @@ from __future__ import absolute_import +import os.path + import tesserocr -from ocrd_utils import getLogger, concat_padded, points_from_xywh, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + MetadataItemType, + LabelsType, LabelType, CoordsType, - to_xml ) from ocrd_models.ocrd_page_generateds import BorderType - from ocrd import Processor -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + bbox_from_points, points_from_bbox, + bbox_from_xywh +) -log = getLogger('processor.TesserocrCrop') +TOOL = 'ocrd-tesserocr-crop' +LOG = getLogger('processor.TesserocrCrop') class TesserocrCrop(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-crop'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrCrop, self).__init__(*args, **kwargs) def process(self): + """Performs page cropping with Tesseract on the workspace. + + Open and deserialize PAGE input files and their respective images. 
+ Set up Tesseract to detect text blocks on each page, and find + the largest coordinate extent spanning all of them. Use this + extent in defining a Border, and add that to the page. + + Produce new output files by serialising the resulting hierarchy. """ - Performs the cropping. - """ + padding = self.parameter['padding'] + with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: - # print(self.input_file_grp) + # disable table detection here (tables count as text blocks), + # because we do not want to risk confusing the spine with + # a column separator and thus creeping into a neighbouring + # page: + tessapi.SetVariable("textord_tabfind_find_tables", "0") for (n, input_file) in enumerate(self.input_files): - # print(input_file) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - log.debug("Cropping with tesseract") - tessapi.SetImage(image) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) # resolved up front: its size seeds the region extent below (`image` is not bound yet here) + border = page.get_Border() + if border: + left, top, right, bottom = bbox_from_points(border.get_Coords().points) + LOG.warning('Overwriting existing Border: %i:%i,%i:%i', + left, top, right, bottom) + regions = page.get_TextRegion() + if regions: + min_x = page_image.width + min_y = page_image.height + max_x = 0 + max_y = 0 + for region in regions: + left, top, right, bottom = bbox_from_points(region.get_Coords().points) + min_x = 
min(min_x, left) + min_y = min(min_y, top) + max_x = max(max_x, right) + max_y = max(max_y, bottom) + LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i', + min_x, max_x, min_y, max_y) + LOG.debug("Cropping with tesseract") + tessapi.SetImage(page_image) + # PSM.SPARSE_TEXT: get as much text as possible in no particular order + # PSM.AUTO (default): includes tables (dangerous) + tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT) # # helper variables for saving the box coordinates # - min_x = image.width - min_y = image.height + min_x = page_image.width + min_y = page_image.height max_x = 0 max_y = 0 - - # iterate over all boxes and compare their extent - # to the min and max values + # iterate over all text blocks and compare their + # bbox extent to the running min and max values for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): - points, index = points_from_xywh(component[1]), component[2] - + image, xywh, index, para = component # # the region reference in the reading order element # ID = "region%04d" % index - log.debug("Detected region '%s': %s", ID, points) - - for pair in points.split(' '): - x, y = (int(pair.split(',')[0]), int(pair.split(',')[1])) - if x < min_x: - min_x = x - if y < min_y: - min_y = y - elif x > max_x: - max_x = x - elif y > max_y: - max_y = y - log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)) + left, top, right, bottom = bbox_from_xywh(xywh) + LOG.debug("Detected text region '%s': %i:%i,%i:%i", + ID, left, right, top, bottom) + # filter region results: + bin_bbox = image.getbbox() + if not bin_bbox: + # this does happen! + LOG.info("Ignoring region '%s' because its binarization is empty", ID) + continue + if bin_bbox[2]-bin_bbox[0] < 30 or bin_bbox[3]-bin_bbox[1] < 30: + # we must be conservative here: page numbers are tiny regions, too! 
+ LOG.info("Ignoring region '%s' because its binarization is too small", ID) + continue + min_x = min(min_x, left) + min_y = min(min_y, top) + max_x = max(max_x, right) + max_y = max(max_y, bottom) + LOG.debug("Updated page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) # # set the identified page border # - brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))) - pcgts.get_Page().set_Border(brd) + if min_x < max_x and min_y < max_y: + # add padding: + min_x = max(min_x - padding, 0) + max_x = min(max_x + padding, page_image.width) + min_y = max(min_y - padding, 0) + max_y = min(max_y + padding, page_image.height) + LOG.debug("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) + border = BorderType(Coords=CoordsType( + points_from_bbox(min_x, min_y, max_x, max_y))) + page.set_Border(border) + else: + LOG.error("Cannot find valid extent for page '%s'", page_id) - ID = concat_padded(self.output_file_grp, n) + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py new file mode 100644 index 0000000..9be2335 --- /dev/null +++ b/ocrd_tesserocr/deskew.py @@ -0,0 +1,224 @@ +from __future__ import absolute_import + +import os.path +import math +from tesserocr import ( + PyTessBaseAPI, + PSM, OEM, + Orientation, + WritingDirection, + TextlineOrder +) + +from ocrd_utils import ( + getLogger, concat_padded, + 
MIMETYPE_PAGE +) +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import ( + MetadataItemType, + LabelsType, LabelType, + AlternativeImageType, + TextRegionType, PageType, + to_xml +) +from ocrd import Processor + +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + save_image_file, + membername +) + +TOOL = 'ocrd-tesserocr-deskew' +LOG = getLogger('processor.TesserocrDeskew') +FILEGRP_IMG = 'OCR-D-IMG-DESKEW' + +class TesserocrDeskew(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + kwargs['version'] = OCRD_TOOL['version'] + super(TesserocrDeskew, self).__init__(*args, **kwargs) + + def process(self): + """Performs deskewing of the page / region with Tesseract on the workspace. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the region level + for all text and table regions. + + Set up Tesseract to recognise the region image's orientation, skew + and script (with both OSD and AnalyseLayout). Rotate the image + accordingly, and annotate the angle, readingDirection and textlineOrder. + + Create a corresponding image file, and reference it as AlternativeImage + in the region element and as file with a fileGrp USE `OCR-D-IMG-DESKEW` + in the workspace. + + Produce a new output file by serialising the resulting hierarchy. + """ + oplevel = self.parameter['operation_level'] + with PyTessBaseAPI( + path=TESSDATA_PREFIX, + lang="osd", # osd required for legacy init! + oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD! 
+ psm=PSM.AUTO_OSD + ) as tessapi: + for n, input_file in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) + pcgts = page_from_file(self.workspace.download_file(input_file)) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) + + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + if oplevel == 'page': + self._process_segment(tessapi, page, page_image, page_xywh, + "page '%s'" % page_id, input_file.pageId, + file_id) + else: + regions = page.get_TextRegion() + page.get_TableRegion() + if not regions: + LOG.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + self._process_segment(tessapi, region, region_image, region_xywh, + "region '%s'" % region.id, input_file.pageId, + file_id + '_' + region.id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + mimetype=MIMETYPE_PAGE, + 
local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): + comments = 'cropped' + angle = 0. + tessapi.SetImage(image) + #tessapi.SetPageSegMode(PSM.AUTO_OSD) + # + # orientation/script + # + osr = tessapi.DetectOrientationScript() + if osr: + assert osr['orient_conf'] and not math.isnan(osr['orient_conf']), \ + "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" + if osr['orient_conf'] < 10: + LOG.info('ignoring OSD orientation result %d° due to low confidence %.0f in %s', + osr['orient_deg'], osr['orient_conf'], where) + else: + LOG.info('applying OSD orientation result %d° with high confidence %.0f in %s', + osr['orient_deg'], osr['orient_conf'], where) + angle = osr['orient_deg'] + if angle: + comments += ',rotated-%d' % angle + assert osr['script_conf'] and not math.isnan(osr['script_conf']), \ + "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" + if osr['script_conf'] < 10: + LOG.info('ignoring OSD script result "%s" due to low confidence %.0f in %s', + osr['script_name'], osr['script_conf'], where) + else: + LOG.info('applying OSD script result "%s" with high confidence %.0f in %s', + osr['script_name'], osr['script_conf'], where) + segment.set_primaryScript(osr['script_name']) + else: + LOG.warning('no OSD result in %s', where) + # + # orientation/skew + # + layout = tessapi.AnalyseLayout() + if layout: + orientation, writing_direction, textline_order, deskew_angle = layout.Orientation() + LOG.info('orientation/deskewing for %s: %s / %s / %s / %.3f', where, + membername(Orientation, orientation), + membername(WritingDirection, writing_direction), + membername(TextlineOrder, textline_order), + deskew_angle) + # clockwise rotation, as defined in Tesseract OrientationIdToValue: + angle2 = { + Orientation.PAGE_RIGHT: 270, + 
Orientation.PAGE_DOWN: 180, + Orientation.PAGE_LEFT: 90 + }.get(orientation, 0) + if angle2 != angle: + LOG.warning('inconsistent angles from layout analysis (%d) and orientation detection (%d) in %s', + angle2, angle, where) + deskew_angle *= - 180 / math.pi + if int(deskew_angle): + comments += ',deskewed' + # We could rotate the image by transposition (which is more accurate + # than the general method below), but then the coordinates – + # which are still relative to `imageFilename` – of all the elements + # contained in this segment (i.e. any TextLine if `segment` is TextRegion, + # and any TextRegion if `segment` is Page) will have to be _transposed_ + # (instead of rotated) as well. But PAGE consumers have little chance + # of knowing which method producers chose, so here we generally decide + # to drop this mechanism: + # if angle: + # image = image.transpose({ + # 90: Image.ROTATE_90, + # 180: Image.ROTATE_180, + # 270: Image.ROTATE_270 + # }.get(angle)) # no default + # angle += deskew_angle + if angle: + # Tesseract layout analysis already rotates the image, even for each + # sub-segment (depending on RIL), but the accuracy is not as good + # as setting the image to the sub-segments and running without iterator. + # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region) + # Unfortunately, it does _not_ use expand=True, but chops off corners. 
+ # So we must do it here from the original image ourself: + image = image.rotate(-angle, expand=True, fillcolor='white') + angle = 180 - (180 - angle) % 360 # map to [-179.999,180] + # FIXME: remove that condition as soon as PAGE has orientation on PageType: + if not isinstance(segment, PageType): + segment.set_orientation(angle) + if isinstance(segment, (TextRegionType, PageType)): + segment.set_readingDirection({ + WritingDirection.LEFT_TO_RIGHT: 'left-to-right', + WritingDirection.RIGHT_TO_LEFT: 'right-to-left', + WritingDirection.TOP_TO_BOTTOM: 'top-to-bottom' + }.get(writing_direction, 'bottom-to-top')) + segment.set_textLineOrder({ + TextlineOrder.LEFT_TO_RIGHT: 'left-to-right', + TextlineOrder.RIGHT_TO_LEFT: 'right-to-left', + TextlineOrder.TOP_TO_BOTTOM: 'top-to-bottom' + }.get(textline_order, 'bottom-to-top')) + # baseline = layout.Baseline(RIL.BLOCK) + # if baseline: + # points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1])) + # segment.add_Baseline(BaselineType(points=points)) + # update METS (add the image file): + file_path = save_image_file(self.workspace, image, + file_id, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + segment.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments=comments)) diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 3b35183..31e756a 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -3,6 +3,27 @@ "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { + "ocrd-tesserocr-deskew": { + "executable": "ocrd-tesserocr-deskew", + "categories": ["Image preprocessing"], + "description": "Deskew pages or regions", + "input_file_grp": [ + "OCR-D-IMG", + "OCR-D-SEG-BLOCK" + ], + "output_file_grp": [ + "OCR-D-DESKEW-BLOCK" + ], + "steps": ["preprocessing/optimization/deskewing"], + "parameters": { + "operation_level": { + "type": "string", + "enum": 
["page","region"], + "default": "region", + "description": "PAGE XML hierarchy level to operate on" + } + } + }, "ocrd-tesserocr-recognize": { "executable": "ocrd-tesserocr-recognize", "categories": ["Text recognition and optimization"], @@ -46,7 +67,29 @@ "OCR-D-SEG-BLOCK" ], "steps": ["layout/segmentation/region"], - "parameters": {} + "parameters": { + "overwrite_regions": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the Page level" + }, + "padding": { + "type": "number", + "format": "integer", + "description": "extend detected region rectangles by this many (true) pixels", + "default": 8 + }, + "crop_polygons": { + "type": "boolean", + "default": false, + "description": "annotate polygon coordinates instead of rectangles, and create cropped AlternativeImage masked by the polygon outlines" + }, + "find_tables": { + "type": "boolean", + "default": true, + "description": "recognise table regions (textord_tabfind_find_tables)" + } + } }, "ocrd-tesserocr-segment-line": { "executable": "ocrd-tesserocr-segment-line", @@ -59,7 +102,13 @@ "OCR-D-SEG-LINE" ], "steps": ["layout/segmentation/line"], - "parameters": {} + "parameters": { + "overwrite_lines": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the TextRegion level" + } + } }, "ocrd-tesserocr-segment-word": { "executable": "ocrd-tesserocr-segment-word", @@ -72,7 +121,13 @@ "OCR-D-SEG-WORD" ], "steps": ["layout/segmentation/word"], - "parameters": {} + "parameters": { + "overwrite_words": { + "type": "boolean", + "default": true, + "description": "remove existing layout and text annotation below the TextLine level" + } + } }, "ocrd-tesserocr-crop": { "executable": "ocrd-tesserocr-crop", @@ -85,7 +140,37 @@ "OCR-D-IMG-CROPPED" ], "steps": ["preprocessing/optimization/cropping"], - "parameters" : {} + "parameters" : { + "padding": { + "type": "number", + "format": "integer", + "description": 
"extend detected border by this many (true) pixels on every side", + "default": 4 + } + } + }, + "ocrd-tesserocr-binarize": { + "executable": "ocrd-tesserocr-binarize", + "categories": ["Image preprocessing"], + "description": "Binarize pages, regions or lines", + "input_file_grp": [ + "OCR-D-IMG", + "OCR-D-SEG-BLOCK", + "OCR-D-SEG-LINE" + ], + "output_file_grp": [ + "OCR-D-BIN-BLOCK", + "OCR-D-BIN-LINE" + ], + "steps": ["preprocessing/optimization/binarization"], + "parameters": { + "operation_level": { + "type": "string", + "enum": ["page", "region", "line"], + "default": "region", + "description": "PAGE XML hierarchy level to operate on" + } + } } } } diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 86a2603..959200e 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -1,14 +1,14 @@ from __future__ import absolute_import -import math +import os.path from tesserocr import ( RIL, PSM, - PyTessBaseAPI, get_languages, - Orientation, TextlineOrder, WritingDirection) + PyTessBaseAPI, get_languages) from ocrd_utils import ( getLogger, concat_padded, - polygon_from_points, xywh_from_points, points_from_x0y0x1y1, + points_from_x0y0x1y1, + xywh_from_points, points_from_xywh, MIMETYPE_PAGE) from ocrd_models.ocrd_page import ( CoordsType, @@ -19,25 +19,33 @@ to_xml) from ocrd_modelfactory import page_from_file from ocrd import Processor + from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line, + image_from_word, + image_from_glyph +) -log = getLogger('processor.TesserocrRecognize') +TOOL = 'ocrd-tesserocr-recognize' +LOG = getLogger('processor.TesserocrRecognize') CHOICE_THRESHOLD_NUM = 6 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 0.2 # maximum score drop from best choice to query and annotate -MAX_ELEMENTS = 500 # maximum number of lower level elements embedded within each element (for word/glyph iterators) class 
TesserocrRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-recognize'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrRecognize, self).__init__(*args, **kwargs) def process(self): """Perform OCR recognition with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the requested `textequiv_level`. If `overwrite_words` is enabled and any layout annotation below the line level already exists, then remove it @@ -46,10 +54,11 @@ def process(self): the appropriate mode and `model`. Create new elements below the line level if necessary. Put text results and confidence values into new TextEquiv at `textequiv_level`, and make the higher levels consistent - with that (by concatenation joined by whitespace). Produce new output - files by serialising the resulting hierarchy. + with that (by concatenation joined by whitespace). + + Produce new output files by serialising the resulting hierarchy. 
""" - log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) + LOG.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) maxlevel = self.parameter['textequiv_level'] model = get_languages()[1][-1] # last installed model if 'model' in self.parameter: @@ -58,7 +67,8 @@ def process(self): if sub_model not in get_languages()[1]: raise Exception("configured model " + sub_model + " is not installed") with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: - log.info("Using model '%s' in %s for recognition at the %s level", model, get_languages()[0], maxlevel) + LOG.info("Using model '%s' in %s for recognition at the %s level", + model, get_languages()[0], maxlevel) # todo: populate GetChoiceIterator() with LSTM models, too: #tessapi.SetVariable("lstm_choice_mode", "2") # todo: determine relevancy of these variables: @@ -107,66 +117,60 @@ def process(self): # user_words_file # user_patterns_file for (n, input_file) in enumerate(self.input_files): - log.info("INPUT FILE %i / %s", n, input_file) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - # TODO use binarized / gray - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - tessapi.SetImage(pil_image) metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", - name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0], - value='ocrd-tesserocr-recognize', + name=self.ocrd_tool['steps'][0], + value=TOOL, # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this # what we want here is `externalModel="ocrd-tool" externalId="parameters"` Labels=[LabelsType(#externalRef="parameters", Label=[LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys()])])) - log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId()) - 
regions = pcgts.get_Page().get_TextRegion() + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + #tessapi.SetImage(page_image) + LOG.info("Processing page '%s'", page_id) + regions = page.get_TextRegion() if not regions: - log.warning("Page contains no text regions") - self._process_regions(regions, maxlevel, tessapi) + LOG.warning("Page '%s' contains no text regions", page_id) + else: + self._process_regions(tessapi, regions, page_image, page_xywh) page_update_higher_textequiv_levels(maxlevel, pcgts) - ID = concat_padded(self.output_file_grp, n) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) - def _process_regions(self, regions, maxlevel, tessapi): + def _process_regions(self, tessapi, regions, page_image, page_xywh): for region in regions: - log.debug("Recognizing text in region '%s'", region.id) - # todo: determine if and how this can still be used for region classification: - # result_it = tessapi.GetIterator() - # if not result_it or result_it.Empty(RIL.BLOCK) - # ptype = result_it.BlockType() - # PT.UNKNOWN - # PT.FLOWING_TEXT - # PT.HEADING_TEXT - # PT.PULLOUT_TEXT - # PT.EQUATION - # PT.TABLE - # PT.VERTICAL_TEXT - # PT.CAPTION_TEXT - # PT.HORZ_LINE - # PT.VERT_LINE - # PT.NOISE - # PT.COUNT - # ... 
- if maxlevel == 'region': - region_xywh = xywh_from_points(region.get_Coords().points) - tessapi.SetRectangle(region_xywh['x'], region_xywh['y'], region_xywh['w'], region_xywh['h']) + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + if self.parameter['textequiv_level'] == 'region': + tessapi.SetImage(region_image) tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + #if region.get_primaryScript() not in tessapi.GetLoadedLanguages()... + LOG.debug("Recognizing text in region '%s'", region.id) region_text = tessapi.GetUTF8Text().rstrip("\n\f") region_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too if region.get_TextEquiv(): - log.warning("Region '%s' already contained text results", region.id) + LOG.warning("Region '%s' already contained text results", region.id) region.set_TextEquiv([]) # todo: consider SetParagraphSeparator region.add_TextEquiv(TextEquivType(Unicode=region_text, conf=region_conf)) @@ -174,24 +178,27 @@ def _process_regions(self, regions, maxlevel, tessapi): ## line, word, or glyph level: textlines = region.get_TextLine() if not textlines: - log.warning("Region '%s' contains no text lines", region.id) + LOG.warning("Region '%s' contains no text lines", region.id) else: - self._process_lines(textlines, maxlevel, tessapi) + self._process_lines(tessapi, textlines, region_image, region_xywh) - def _process_lines(self, textlines, maxlevel, tessapi): + def _process_lines(self, tessapi, textlines, region_image, region_xywh): for line in textlines: if self.parameter['overwrite_words']: line.set_Word([]) - log.debug("Recognizing text in line '%s'", line.id) - line_xywh = xywh_from_points(line.get_Coords().points) - # log.debug("xywh: %s", line_xywh) - tessapi.SetRectangle(line_xywh['x'], line_xywh['y'], line_xywh['w'], line_xywh['h']) - tessapi.SetPageSegMode(PSM.SINGLE_LINE) # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models - if maxlevel == 'line': + 
line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + # todo: Tesseract works better if the line images have a 5px margin everywhere + tessapi.SetImage(line_image) + # RAW_LINE fails with pre-LSTM models, but sometimes better with LSTM models + tessapi.SetPageSegMode(PSM.SINGLE_LINE) + #if line.get_primaryScript() not in tessapi.GetLoadedLanguages()... + LOG.debug("Recognizing text in line '%s'", line.id) + if self.parameter['textequiv_level'] == 'line': line_text = tessapi.GetUTF8Text().rstrip("\n\f") line_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too if line.get_TextEquiv(): - log.warning("Line '%s' already contained text results", line.id) + LOG.warning("Line '%s' already contained text results", line.id) line.set_TextEquiv([]) # todo: consider BlankBeforeWord, SetLineSeparator line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf)) @@ -200,61 +207,75 @@ def _process_lines(self, textlines, maxlevel, tessapi): words = line.get_Word() if words: ## external word layout: - log.warning("Line '%s' contains words already, recognition might be suboptimal", line.id) - self._process_existing_words(words, maxlevel, tessapi) + LOG.warning("Line '%s' contains words already, recognition might be suboptimal", line.id) + self._process_existing_words(tessapi, words, line_image, line_xywh) else: ## internal word and glyph layout: tessapi.Recognize() - self._process_words_in_line(line, maxlevel, tessapi.GetIterator()) + self._process_words_in_line(tessapi.GetIterator(), line, line_xywh) - def _process_words_in_line(self, line, maxlevel, result_it): - for word_no in range(0, MAX_ELEMENTS): # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD) - if not result_it: - log.error("No iterator at '%s'", line.id) - break - if result_it.Empty(RIL.WORD): - log.warning("No word in line '%s'", line.id) - break + def _process_words_in_line(self, result_it, line, line_xywh): + if not result_it or 
result_it.Empty(RIL.WORD): + LOG.warning("No text in line '%s'", line.id) + return + # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD): + word_no = 0 + while result_it and not result_it.Empty(RIL.WORD): word_id = '%s_word%04d' % (line.id, word_no) - log.debug("Recognizing text in word '%s'", word_id) - word_bbox = result_it.BoundingBox(RIL.WORD) - word = WordType(id=word_id, Coords=CoordsType(points_from_x0y0x1y1(word_bbox))) + LOG.debug("Decoding text in word '%s'", word_id) + bbox = result_it.BoundingBox(RIL.WORD) + points = points_from_x0y0x1y1(bbox) + # add offset from image: + xywh = xywh_from_points(points) + xywh['x'] += line_xywh['x'] + xywh['y'] += line_xywh['y'] + points = points_from_xywh(xywh) + word = WordType(id=word_id, Coords=CoordsType(points)) line.add_Word(word) # todo: determine if font attributes available for word level will work with LSTM models word_attributes = result_it.WordFontAttributes() if word_attributes: - word_style = TextStyleType(fontSize=word_attributes['pointsize'] if 'pointsize' in word_attributes else None, - fontFamily=word_attributes['font_name'] if 'font_name' in word_attributes else None, - bold=None if 'bold' not in word_attributes else word_attributes['bold'], - italic=None if 'italic' not in word_attributes else word_attributes['italic'], - underlined=None if 'underlined' not in word_attributes else word_attributes['underlined'], - monospace=None if 'monospace' not in word_attributes else word_attributes['monospace'], - serif=None if 'serif' not in word_attributes else word_attributes['serif'] - ) + word_style = TextStyleType( + fontSize=word_attributes['pointsize'] + if 'pointsize' in word_attributes else None, + fontFamily=word_attributes['font_name'] + if 'font_name' in word_attributes else None, + bold=word_attributes['bold'] + if 'bold' in word_attributes else None, + italic=word_attributes['italic'] + if 'italic' in word_attributes else None, + underlined=word_attributes['underlined'] + if 'underlined' in 
word_attributes else None, + monospace=word_attributes['monospace'] + if 'monospace' in word_attributes else None, + serif=word_attributes['serif'] + if 'serif' in word_attributes else None) word.set_TextStyle(word_style) # (or somewhere in custom attribute?) # add word annotation unconditionally (i.e. even for glyph level): - word.add_TextEquiv(TextEquivType(Unicode=result_it.GetUTF8Text(RIL.WORD), conf=result_it.Confidence(RIL.WORD)/100)) - if maxlevel == 'word': - pass - else: - self._process_glyphs_in_word(word, result_it) + word.add_TextEquiv(TextEquivType( + Unicode=result_it.GetUTF8Text(RIL.WORD), + conf=result_it.Confidence(RIL.WORD)/100)) + if self.parameter['textequiv_level'] != 'word': + self._process_glyphs_in_word(result_it, word, xywh) if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD): break else: + word_no += 1 result_it.Next(RIL.WORD) - def _process_existing_words(self, words, maxlevel, tessapi): + def _process_existing_words(self, tessapi, words, line_image, line_xywh): for word in words: - log.debug("Recognizing text in word '%s'", word.id) - word_xywh = xywh_from_points(word.get_Coords().points) - tessapi.SetRectangle(word_xywh['x'], word_xywh['y'], word_xywh['w'], word_xywh['h']) + word_image, word_xywh = image_from_word( + self.workspace, word, line_image, line_xywh) + tessapi.SetImage(word_image) tessapi.SetPageSegMode(PSM.SINGLE_WORD) - if maxlevel == 'word': + if self.parameter['textequiv_level'] == 'word': + LOG.debug("Recognizing text in word '%s'", word.id) word_text = tessapi.GetUTF8Text().rstrip("\n\f") word_conf = tessapi.AllWordConfidences() word_conf = word_conf[0]/100.0 if word_conf else 0.0 if word.get_TextEquiv(): - log.warning("Word '%s' already contained text results", word.id) + LOG.warning("Word '%s' already contained text results", word.id) word.set_TextEquiv([]) # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle()) word.add_TextEquiv(TextEquivType(Unicode=word_text, conf=word_conf)) @@ -263,62 
+284,68 @@ def _process_existing_words(self, words, maxlevel, tessapi): glyphs = word.get_Glyph() if glyphs: ## external glyph layout: - log.warning("Word '%s' contains glyphs already, recognition might be suboptimal", word.id) - self._process_existing_glyphs(glyphs, tessapi) + LOG.warning("Word '%s' contains glyphs already, recognition might be suboptimal", word.id) + self._process_existing_glyphs(tessapi, glyphs, word_image, word_xywh) else: ## internal glyph layout: tessapi.Recognize() - self._process_glyphs_in_word(word, tessapi.GetIterator()) + self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh) - def _process_existing_glyphs(self, glyphs, tessapi): + def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): for glyph in glyphs: - log.debug("Recognizing glyph in word '%s'", glyph.id) - glyph_xywh = xywh_from_points(glyph.get_Coords().points) - tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'], glyph_xywh['w'], glyph_xywh['h']) + glyph_image, glyph_xywh = image_from_glyph( + self.workspace, glyph, word_image, word_xywh) + tessapi.SetImage(glyph_image) tessapi.SetPageSegMode(PSM.SINGLE_CHAR) + LOG.debug("Recognizing text in glyph '%s'", glyph.id) if glyph.get_TextEquiv(): - log.warning("Glyph '%s' already contained text results", glyph.id) + LOG.warning("Glyph '%s' already contained text results", glyph.id) glyph.set_TextEquiv([]) #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f") glyph_conf = tessapi.AllWordConfidences() glyph_conf = glyph_conf[0]/100.0 if glyph_conf else 0.0 - #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) + #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) result_it = tessapi.GetIterator() if not result_it or result_it.Empty(RIL.SYMBOL): - log.error("No glyph here") + LOG.error("No text in glyph '%s'", glyph.id) continue choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = 
choice.Confidence()/100 - #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf)) - def _process_glyphs_in_word(self, word, result_it): - for glyph_no in range(0, MAX_ELEMENTS): # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL) - if not result_it: - log.error("No iterator at '%s'", word.id) - break - if result_it.Empty(RIL.SYMBOL): - log.debug("No glyph here") - break + def _process_glyphs_in_word(self, result_it, word, word_xywh): + if not result_it or result_it.Empty(RIL.SYMBOL): + LOG.debug("No glyph in word '%s'", word.id) + return + # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL): + glyph_no = 0 + while result_it and not result_it.Empty(RIL.SYMBOL): glyph_id = '%s_glyph%04d' % (word.id, glyph_no) - log.debug("Recognizing text in glyph '%s'", glyph_id) + LOG.debug("Decoding text in glyph '%s'", glyph_id) # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice? glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice? 
- #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) - glyph_bbox = result_it.BoundingBox(RIL.SYMBOL) - glyph = GlyphType(id=glyph_id, Coords=CoordsType(points_from_x0y0x1y1(glyph_bbox))) + #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) + bbox = result_it.BoundingBox(RIL.SYMBOL) + points = points_from_x0y0x1y1(bbox) + # add offset from image: + xywh = xywh_from_points(points) + xywh['x'] += word_xywh['x'] + xywh['y'] += word_xywh['y'] + points = points_from_xywh(xywh) + glyph = GlyphType(id=glyph_id, Coords=CoordsType(points)) word.add_Glyph(glyph) choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() alternative_conf = choice.Confidence()/100 - #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) + #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf) if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or choice_no > CHOICE_THRESHOLD_NUM): break @@ -327,6 +354,7 @@ def _process_glyphs_in_word(self, word, result_it): if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL): break else: + glyph_no += 1 result_it.Next(RIL.SYMBOL) def page_update_higher_textequiv_levels(level, pcgts): diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 894c5c7..4890237 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -1,53 +1,109 @@ from __future__ import absolute_import -from tesserocr import PyTessBaseAPI, RIL + +import os.path +from tesserocr import PyTessBaseAPI, RIL, PSM + from ocrd import Processor -from ocrd_utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + points_from_xywh, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, + LabelType, LabelsType, + MetadataItemType, TextLineType, - to_xml 
) -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region +) -log = getLogger('processor.TesserocrSegmentLine') +TOOL = 'ocrd-tesserocr-segment-line' +LOG = getLogger('processor.TesserocrSegmentLine') class TesserocrSegmentLine(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-line'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentLine, self).__init__(*args, **kwargs) def process(self): + """Performs (text) line segmentation with Tesseract on the workspace. + + Open and deserialize PAGE input files and their respective images, + then iterate over the element hierarchy down to the region level, + and remove any existing TextLine elements (unless `overwrite_lines` + is False). + + Set up Tesseract to detect lines, and add each one to the region + at the detected coordinates. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the line segmentation. 
- """ - with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + overwrite_lines = self.parameter['overwrite_lines'] + + with PyTessBaseAPI( + psm=PSM.SINGLE_BLOCK, + path=TESSDATA_PREFIX + ) as tessapi: for (n, input_file) in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image_url = pcgts.get_Page().imageFilename - for region in pcgts.get_Page().get_TextRegion(): - log.debug("Detecting lines in %s with tesseract", region.id) - image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(region.get_Coords().points)) - tessapi.SetImage(image) - offset = xywh_from_points(region.get_Coords().points) - for (line_no, component) in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True)): + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + + for region in page.get_TextRegion(): + if region.get_TextLine(): + if overwrite_lines: + LOG.info('removing existing TextLines in region "%s"', region.id) + region.set_TextLine([]) + else: + LOG.warning('keeping existing TextLines in region "%s"', region.id) + LOG.debug("Detecting lines in region '%s'", region.id) + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) + tessapi.SetImage(region_image) + for line_no, component 
in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) line_xywh = component[1] - line_xywh['x'] += offset['x'] - line_xywh['y'] += offset['y'] + line_xywh['x'] += region_xywh['x'] + line_xywh['y'] += region_xywh['y'] line_points = points_from_xywh(line_xywh) - region.add_TextLine(TextLineType(id=line_id, Coords=CoordsType(line_points))) - ID = concat_padded(self.output_file_grp, n) + region.add_TextLine(TextLineType( + id=line_id, Coords=CoordsType(line_points))) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index c8df11c..46059ab 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -1,9 +1,21 @@ from __future__ import absolute_import -import tesserocr -from ocrd_utils import getLogger, concat_padded, points_from_x0y0x1y1, xywh_from_points, MIMETYPE_PAGE + +import os.path +from tesserocr import ( + PyTessBaseAPI, + PSM, RIL, PT +) + +from ocrd_utils import ( + getLogger, concat_padded, + points_from_x0y0x1y1, + points_from_xywh, xywh_from_points, + MIMETYPE_PAGE) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - CoordsType, + MetadataItemType, + LabelsType, LabelType, + CoordsType, AlternativeImageType, OrderedGroupType, ReadingOrderType, RegionRefIndexedType, @@ -12,90 +24,241 @@ MathsRegionType, SeparatorRegionType, 
NoiseRegionType, + to_xml) +from ocrd_models.ocrd_page_generateds import TableRegionType +from ocrd import Processor - to_xml +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + save_image_file, + membername ) -from ocrd import Processor -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +TOOL = 'ocrd-tesserocr-segment-region' +LOG = getLogger('processor.TesserocrSegmentRegion') +FILEGRP_IMG = 'OCR-D-IMG-CROP' -log = getLogger('processor.TesserocrSegmentRegion') +# (will be passed as padding to both BoundingBox and GetImage) +# (actually, Tesseract honours padding only on the left and bottom, +# whereas right and top are increased less) class TesserocrSegmentRegion(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-region'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentRegion, self).__init__(*args, **kwargs) def process(self): + """Performs (text) region segmentation with Tesseract on the workspace. + + Open and deserialize PAGE input files and their respective images, + and remove any existing Region and ReadingOrder elements + (unless `overwrite_regions` is False). + + Set up Tesseract to detect blocks, and add each one to the page + as a region according to BlockType at the detected coordinates. + If `find_tables` is True, try to detect table blocks and add them + as (atomic) TableRegion. + + If `crop_polygons` is True, create a cropped (and possibly deskewed) + raw image file for each region (masked along its polygon outline), + and reference it as AlternativeImage in the region element and + as file with a fileGrp USE equal `OCR-D-IMG-CROP` in the workspace. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the region segmentation. 
- """ - with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: - # print(self.input_file_grp) + overwrite_regions = self.parameter['overwrite_regions'] + find_tables = self.parameter['find_tables'] + + with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: + if not find_tables: + # disable table detection here, so tables will be + # analysed as independent text/line blocks: + tessapi.SetVariable("textord_tabfind_find_tables", "0") for (n, input_file) in enumerate(self.input_files): + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - log.debug("Detecting regions with tesseract") - tessapi.SetImage(image) - # respect border element if present - if pcgts.get_Page().get_Border() is not None and pcgts.get_Page().get_Border().get_Coords() is not None: - border = xywh_from_points(pcgts.get_Page().get_Border().get_Coords().points) - log.debug("Explictly set page border at %s", pcgts.get_Page().get_Border().get_Coords().points) - tessapi.SetRectangle(border['x'], border['y'], border['w'], border['h']) - - # recognize the layout and the region types - it = tessapi.AnalyseLayout() - index = 0 - while it and not it.Empty(tesserocr.RIL.BLOCK): - points = points_from_x0y0x1y1(it.BoundingBox(tesserocr.RIL.BLOCK)) - - # - # the region reference in the reading order element - # - ID = "region%04d" % index - log.debug("Detected region '%s': %s", ID, points) - # - ro = pcgts.get_Page().get_ReadingOrder() - if ro is None: - ro = ReadingOrderType() - pcgts.get_Page().set_ReadingOrder(ro) - # - og = ro.get_OrderedGroup() - if og is None: - og = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(og) - # - og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index)) - - # - # region switch - # - block_type = 
it.BlockType() - if block_type in [tesserocr.PT.FLOWING_TEXT, tesserocr.PT.HEADING_TEXT, tesserocr.PT.PULLOUT_TEXT]: - pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.FLOWING_IMAGE, tesserocr.PT.HEADING_IMAGE, tesserocr.PT.PULLOUT_IMAGE]: - pcgts.get_Page().add_ImageRegion(ImageRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.HORZ_LINE, tesserocr.PT.VERT_LINE]: - pcgts.get_Page().add_SeparatorRegion(SeparatorRegionType(id=ID, Coords=CoordsType(points=points))) - elif block_type in [tesserocr.PT.INLINE_EQUATION, tesserocr.PT.EQUATION]: - pcgts.get_Page().add_MathsRegion(MathsRegionType(id=ID, Coords=CoordsType(points=points))) + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + if page.get_TextRegion(): + if overwrite_regions: + LOG.info('removing existing TextRegions') + page.set_TextRegion([]) else: - pcgts.get_Page().add_NoiseRegion(NoiseRegionType(id=ID, Coords=CoordsType(points=points))) - - # - # iterator increment - # - index += 1 - it.Next(tesserocr.RIL.BLOCK) + LOG.warning('keeping existing TextRegions') + # todo: also make non-text regions protected? 
+ page.set_AdvertRegion([]) + page.set_ChartRegion([]) + page.set_ChemRegion([]) + page.set_GraphicRegion([]) + page.set_ImageRegion([]) + page.set_LineDrawingRegion([]) + page.set_MathsRegion([]) + page.set_MusicRegion([]) + page.set_NoiseRegion([]) + page.set_SeparatorRegion([]) + page.set_TableRegion([]) + page.set_UnknownRegion([]) + if page.get_ReadingOrder(): + if overwrite_regions: + LOG.info('overwriting existing ReadingOrder') + # (cannot sustain old regionrefs) + page.set_ReadingOrder([]) + else: + LOG.warning('keeping existing ReadingOrder') + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + LOG.info("Detecting regions in page '%s'", page_id) + tessapi.SetImage(page_image) # is already cropped to Border + tessapi.SetPageSegMode(PSM.AUTO) # (default) - ID = concat_padded(self.output_file_grp, n) + # detect the region segments and types: + layout = tessapi.AnalyseLayout() + self._process_page(layout, page, page_image, page_xywh, input_file.pageId, file_id) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename='%s/%s' % (self.output_file_grp, ID), - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) + + def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): + # equivalent to GetComponentImages with raw_image=True, + # (which would also give raw coordinates), + # except we are also interested in the iterator's BlockType() here, + # and its BlockPolygon() + index = 0 + while it and not 
it.Empty(RIL.BLOCK): + bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding']) + points = points_from_x0y0x1y1(bbox) + # add offset from any Border: + xywh = xywh_from_points(points) + xywh['x'] += page_xywh['x'] + xywh['y'] += page_xywh['y'] + points = points_from_xywh(xywh) + # this crashes due to tesserocr issue #184 (fixed in PR #185); + # also, sometimes these polygons are not planar (probably a + # bug in Tesseract itself): + # TODO: uncomment as soon as a merged tesserocr release is out: + # polygon = it.BlockPolygon() + # if self.parameter['crop_polygons'] and polygon and list(polygon): + # # add offset from any Border, and + # # avoid negative results (invalid in PAGE): + # polygon = [(max(0, x + page_xywh['x']), + # max(0, y + page_xywh['y'])) + # for x, y in polygon] + # points = points_from_polygon(polygon) + coords = CoordsType(points=points) + # if xywh['w'] < 30 or xywh['h'] < 30: + # LOG.info('Ignoring too small region: %s', points) + # it.Next(RIL.BLOCK) + # continue + # region_image_bin = it.GetBinaryImage(RIL.BLOCK) + # if not region_image_bin.getbbox(): + # LOG.info('Ignoring binary-empty region: %s', points) + # it.Next(RIL.BLOCK) + # continue + # + # the region reference in the reading order element + # + ID = "region%04d" % index + ro = page.get_ReadingOrder() + if not ro: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + og = ro.get_OrderedGroup() + if not og: + og = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(og) + og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index)) + # + # region type switch + # + block_type = it.BlockType() + if block_type in [PT.FLOWING_TEXT, + PT.HEADING_TEXT, + PT.PULLOUT_TEXT, + PT.CAPTION_TEXT, + # TABLE is contained in PTIsTextType, but + # it is a bad idea to create a TextRegion + # for it (better set `find_tables` False): + # PT.TABLE, + # will always yield a 90° deskew angle below: + PT.VERTICAL_TEXT]: + region = TextRegionType(id=ID, Coords=coords) + 
page.add_TextRegion(region) + elif block_type in [PT.FLOWING_IMAGE, + PT.HEADING_IMAGE, + PT.PULLOUT_IMAGE]: + region = ImageRegionType(id=ID, Coords=coords) + page.add_ImageRegion(region) + elif block_type in [PT.HORZ_LINE, + PT.VERT_LINE]: + region = SeparatorRegionType(id=ID, Coords=coords) + page.add_SeparatorRegion(region) + elif block_type in [PT.INLINE_EQUATION, + PT.EQUATION]: + region = MathsRegionType(id=ID, Coords=coords) + page.add_MathsRegion(region) + elif block_type == PT.TABLE: + # without API access to StructuredTable we cannot + # do much for a TableRegionType (i.e. nrows, ncols, + # coordinates of cells for recursive regions etc), + # but this could be achieved later by a specialised + # processor + region = TableRegionType(id=ID, Coords=coords) + page.add_TableRegion(region) + else: + region = NoiseRegionType(id=ID, Coords=coords) + page.add_NoiseRegion() + LOG.info("Detected region '%s': %s (%s)", ID, points, membername(PT, block_type)) + if self.parameter['crop_polygons']: + # Store the cropped (and deskewed) image for the region, + # this is not always preferable, because Tesseract tends + # to produce polygon outlines that are worse than the + # enclosing bounding boxes, and these are always used + # as mask for the image (see above). Also, it chops off + # corners when rotating against the recognised skew. + # Moreover, the mix of colour and white background + # in these images might cause binarization trouble. + # (Although against the latter we could switch to + # GetBinaryImage). + # You have been warned! 
+ # get the raw image (masked by white space along the block polygon): + region_image, top, left = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image) + # update METS (add the image file): + file_path = save_image_file(self.workspace, region_image, + file_id + '_' + ID, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + region.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments="cropped")) + # + # iterator increment + # + index += 1 + it.Next(RIL.BLOCK) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 8b8c8d7..38b0551 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -1,54 +1,112 @@ from __future__ import absolute_import + +import os.path from tesserocr import RIL, PyTessBaseAPI, PSM + from ocrd import Processor -from ocrd_utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + points_from_xywh, + MIMETYPE_PAGE +) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, + LabelType, LabelsType, + MetadataItemType, WordType, to_xml ) from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .common import ( + image_from_page, + image_from_region, + image_from_line +) -log = getLogger('processor.TesserocrSegmentWord') +TOOL = 'ocrd-tesserocr-segment-word' +LOG = getLogger('processor.TesserocrSegmentWord') class TesserocrSegmentWord(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-word'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentWord, self).__init__(*args, **kwargs) def process(self): + """Performs word segmentation with Tesseract on the workspace. 
+ + Open and deserialize PAGE input files and their respective images, + then iterate over the element hierarchy down to the textline level, + and remove any existing Word elements (unless `overwrite_words` + is False). + + Set up Tesseract to detect words, and add each one to the line + at the detected coordinates. + + Produce a new output file by serialising the resulting hierarchy. """ - Performs the line segmentation. - """ + overwrite_words = self.parameter['overwrite_words'] + with PyTessBaseAPI( psm=PSM.SINGLE_LINE, - path=TESSDATA_PREFIX, + path=TESSDATA_PREFIX ) as tessapi: for (n, input_file) in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) - image_url = pcgts.get_Page().imageFilename - for region in pcgts.get_Page().get_TextRegion(): + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this + # what we want here is `externalModel="ocrd-tool" externalId="parameters"` + Labels=[LabelsType(#externalRef="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page = pcgts.get_Page() + page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + page_image, page_xywh = image_from_page( + self.workspace, page, page_image, page_id) + + for region in page.get_TextRegion(): + region_image, region_xywh = image_from_region( + self.workspace, region, page_image, page_xywh) for line in region.get_TextLine(): - log.debug("Detecting words in line '%s'", line.id) - image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points)) - tessapi.SetImage(image) - offset = xywh_from_points(line.get_Coords().points) - for (word_no, 
component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)): + if line.get_Word(): + if overwrite_words: + LOG.info('removing existing Words in line "%s"', line.id) + line.set_Word([]) + else: + LOG.warning('keeping existing Words in line "%s"', line.id) + LOG.debug("Detecting words in line '%s'", line.id) + line_image, line_xywh = image_from_line( + self.workspace, line, region_image, region_xywh) + tessapi.SetImage(line_image) + for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)): word_id = '%s_word%04d' % (line.id, word_no) word_xywh = component[1] - word_xywh['x'] += offset['x'] - word_xywh['y'] += offset['y'] - line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh)))) - ID = concat_padded(self.output_file_grp, n) + word_xywh['x'] += line_xywh['x'] + word_xywh['y'] += line_xywh['y'] + word_points = points_from_xywh(word_xywh) + line.add_Word(WordType( + id=word_id, Coords=CoordsType(word_points))) + + # Use input_file's basename for the new file - + # this way the files retain the same basenames: + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( - ID=ID, + ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, - local_filename='%s/%s' % (self.output_file_grp, ID), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts).encode('utf-8'), - ) + local_filename=os.path.join(self.output_file_grp, + file_id + '.xml'), + content=to_xml(pcgts)) diff --git a/setup.py b/setup.py index c902afe..5048ff3 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,8 @@ - ocrd_tesserocr_segment_line - ocrd_tesserocr_segment_word - ocrd_tesserocr_crop + - ocrd_tesserocr_deskew + - ocrd_tesserocr_binarize """ import codecs @@ -14,11 +16,11 @@ setup( name='ocrd_tesserocr', - version='0.2.2', + version='0.3.0', description='Tesserocr bindings', 
long_description=codecs.open('README.rst', encoding='utf-8').read(), - author='Konstantin Baierer', - author_email='unixprog@gmail.com', + author='Konstantin Baierer, Kay-Michael Würzner, Robert Sachunsky', + author_email='unixprog@gmail.com, wuerzner@gmail.com, sachunsky@informatik.uni-leipzig.de', url='https://github.com/OCR-D/ocrd_tesserocr', license='Apache License 2.0', packages=find_packages(exclude=('tests', 'docs')), @@ -33,6 +35,8 @@ 'ocrd-tesserocr-segment-line=ocrd_tesserocr.cli:ocrd_tesserocr_segment_line', 'ocrd-tesserocr-segment-word=ocrd_tesserocr.cli:ocrd_tesserocr_segment_word', 'ocrd-tesserocr-crop=ocrd_tesserocr.cli:ocrd_tesserocr_crop', + 'ocrd-tesserocr-deskew=ocrd_tesserocr.cli:ocrd_tesserocr_deskew', + 'ocrd-tesserocr-binarize=ocrd_tesserocr.cli:ocrd_tesserocr_binarize', ] }, )