Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement Alternativeimage-based processing #48

Merged
merged 6 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,22 @@ ignored-modules=cv2,tesserocr
[MESSAGES CONTROL]
disable =
ungrouped-imports,
fixme,
bad-continuation,
missing-docstring,
no-self-use,
too-many-arguments,
superfluous-parens,
invalid-name,
line-too-long,
too-many-arguments,
too-many-branches,
too-many-statements,
too-many-locals,
too-few-public-methods,
wrong-import-order,
duplicate-code

# allow indented whitespace (as required by interpreter):
no-space-check=empty-line

# allow non-snake-case identifiers:
good-names=n,i
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [0.2.3] - 2019-06-28

Changed:
* Use basename of input file for output name
* Use .xml filename extension for PAGE output
* Warn about existing border or regions in `crop`
* Use `PSM.SPARSE_TEXT` without tables in `crop`
* Filter unreliable regions in `crop`
* Add padding around border in `crop`
* Delete existing regions in `segment_region`
* Cover vertical text and tables in `segment_region`
* Add parameter `find_tables` in `segment_region`
* Add parameter `crop_polygons` in `segment_region`
* Add parameter `overwrite_regions` in `segment_region`
* Add parameter `overwrite_lines` in `segment_line`
* Add parameter `overwrite_words` in `segment_word`
* Add page/region-level processor `deskew`
* Add page/region/line-level processor `binarize`
* Respect AlternativeImage on all levels

## [0.2.2] - 2019-05-20

Changed:
Expand Down
2 changes: 2 additions & 0 deletions ocrd_tesserocr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@
from .segment_line import TesserocrSegmentLine
from .segment_region import TesserocrSegmentRegion
from .crop import TesserocrCrop
from .deskew import TesserocrDeskew
from .binarize import TesserocrBinarize
139 changes: 139 additions & 0 deletions ocrd_tesserocr/binarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import absolute_import

import os.path
from tesserocr import (
PyTessBaseAPI,
PSM, RIL
)

from ocrd_utils import (
getLogger, concat_padded,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
MetadataItemType,
LabelsType, LabelType,
AlternativeImageType,
TextRegionType,
to_xml
)
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
image_from_page,
image_from_region,
image_from_line,
save_image_file,
membername
)

TOOL = 'ocrd-tesserocr-binarize'
LOG = getLogger('processor.TesserocrBinarize')
FILEGRP_IMG = 'OCR-D-IMG-BIN'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can only be a temporary solution (cf. OCR-D/spec#117).


class TesserocrBinarize(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
kwargs['version'] = OCRD_TOOL['version']
super(TesserocrBinarize, self).__init__(*args, **kwargs)

def process(self):
"""Performs binarization with Tesseract on the workspace.

Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested level.

Set up Tesseract to recognize the segment image's layout, and get
the binarized image. Create an image file, and reference it as
AlternativeImage in the element and as file with a fileGrp USE
equal `OCR-D-IMG-BIN` in the workspace.

Produce a new output file by serialising the resulting hierarchy.
"""
oplevel = self.parameter['operation_level']
with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for n, input_file in enumerate(self.input_files):
file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
# FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
# what we want here is `externalModel="ocrd-tool" externalId="parameters"`
Labels=[LabelsType(#externalRef="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

page_image, page_xywh = image_from_page(
self.workspace, page, page_image, page_id)
if oplevel == 'page':
tessapi.SetPageSegMode(PSM.AUTO)
self._process_segment(tessapi, RIL.BLOCK, page, page_image, page_xywh,
"page '%s'" % page_id, input_file.pageId,
file_id)
else:
regions = page.get_TextRegion() + page.get_TableRegion()
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_xywh = image_from_region(
self.workspace, region, page_image, page_xywh)
if oplevel == 'region':
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
"region '%s'" % region.id, input_file.pageId,
file_id + '_' + region.id)
elif isinstance(region, TextRegionType):
lines = region.get_TextLine()
if not lines:
LOG.warning("Page '%s' region '%s' contains no text lines",
page_id, region.id)
for line in lines:
line_image, line_xywh = image_from_line(
self.workspace, line, region_image, region_xywh)
tessapi.SetPageSegMode(PSM.SINGLE_LINE)
self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
"line '%s'" % line.id, input_file.pageId,
file_id + '_' + region.id + '_' + line.id)

# Use input_file's basename for the new file -
# this way the files retain the same basenames:
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
mimetype=MIMETYPE_PAGE,
local_filename=os.path.join(self.output_file_grp,
file_id + '.xml'),
content=to_xml(pcgts))

def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id):
tessapi.SetImage(image)
image_bin = None
layout = tessapi.AnalyseLayout()
if layout:
image_bin = layout.GetBinaryImage(ril)
if not image_bin:
LOG.error('Cannot binarize %s', where)
return
# update METS (add the image file):
file_path = save_image_file(self.workspace, image_bin,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
# update PAGE (reference the image file):
segment.add_AlternativeImage(AlternativeImageType(
filename=file_path, comments="binarized"))
12 changes: 12 additions & 0 deletions ocrd_tesserocr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr.segment_word import TesserocrSegmentWord
from ocrd_tesserocr.crop import TesserocrCrop
from ocrd_tesserocr.deskew import TesserocrDeskew
from ocrd_tesserocr.binarize import TesserocrBinarize

@click.command()
@ocrd_cli_options
Expand All @@ -31,3 +33,13 @@ def ocrd_tesserocr_recognize(*args, **kwargs):
@ocrd_cli_options
def ocrd_tesserocr_crop(*args, **kwargs):
return ocrd_cli_wrap_processor(TesserocrCrop, *args, **kwargs)

@click.command()
@ocrd_cli_options
def ocrd_tesserocr_deskew(*args, **kwargs):
return ocrd_cli_wrap_processor(TesserocrDeskew, *args, **kwargs)

@click.command()
@ocrd_cli_options
def ocrd_tesserocr_binarize(*args, **kwargs):
return ocrd_cli_wrap_processor(TesserocrBinarize, *args, **kwargs)
Loading