Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port to v3 #44

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
901098a
bump requirement to ocrd >= 3.0.0a1
kba Aug 11, 2024
78849a9
port binarize to v3
kba Aug 11, 2024
30db9a4
port segment to v3
kba Aug 11, 2024
9ea80c7
port recognize to v3
kba Aug 11, 2024
163ee7d
ocrd-tool.json: add cardinality specs
bertsky Aug 13, 2024
41b0045
test_binarize.py: use stable API
bertsky Aug 13, 2024
340f513
test_recognize.py: use stable API
bertsky Aug 13, 2024
cd0ce01
test_segment.py: use stable API
bertsky Aug 13, 2024
4671e98
remove fileGrp cardinality assertions
bertsky Aug 13, 2024
a497287
binarize: re-instate setup for logger
bertsky Aug 14, 2024
c0c1eb7
adapt to bertsky/core#8
kba Aug 14, 2024
712d1d3
Merge branch 'port-to-v3' of https://github.com/OCR-D/ocrd_kraken int…
kba Aug 14, 2024
e8ec7fe
require regex
kba Aug 15, 2024
e76d708
update to OcrdPageResult change
kba Aug 15, 2024
2832722
update to latest OcrdPageResult and process_page_pcgts
kba Aug 15, 2024
a8a859b
CI: switch back to Ubuntu
bertsky Aug 15, 2024
0e30138
self.logger: adapt to bertsky/core#10
kba Aug 19, 2024
6d287b0
tests: migrate unittest→pytest, simplify
bertsky Aug 29, 2024
316eedb
tests: base→conftest
bertsky Aug 29, 2024
43c600f
tests: also w/ METS server + page-parallel and w/ METS caching
bertsky Aug 30, 2024
32b2e9c
remove v2 tool facility
bertsky Aug 30, 2024
c73b3ef
tests: use workspace manifesto→aufklaerung (1→2 pages), binarize ad h…
bertsky Aug 30, 2024
a23d4c3
tests: avoid running into 'too many failures'
bertsky Aug 30, 2024
ae6445b
update v3 requirement
bertsky Aug 30, 2024
fd15e2a
tests: add actual assertions
bertsky Aug 30, 2024
43a88ea
update v3 requirement
bertsky Aug 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ jobs:
#
# Related issue: https://github.com/actions/runner-images/issues/672.
# runs-on: ubuntu-latest
runs-on: macos-latest
# runs-on: macos-latest
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ docker:

# Run test
test: tests/assets
$(PYTHON) -m pytest tests $(PYTEST_ARGS)
$(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With `--durations=0`, pytest reports the duration of every test, so we get to see what difference in performance these settings make:

93.35s call     tests/test_recognize.py::test_recognize[pageparallel+metscache]
92.28s call     tests/test_recognize.py::test_recognize[pageparallel]
76.19s call     tests/test_recognize.py::test_recognize[]
74.83s call     tests/test_recognize.py::test_recognize[metscache]
55.92s call     tests/test_segment.py::test_run_blla[metscache]
55.11s call     tests/test_segment.py::test_run_blla[]
48.43s call     tests/test_segment.py::test_run_blla[pageparallel+metscache]
41.80s call     tests/test_segment.py::test_run_blla[pageparallel]

(In this case, the test workspace had only 2 pages, so there is little room for page-parallel processing to scale.)


#
# Assets
Expand Down
118 changes: 46 additions & 72 deletions ocrd_kraken/binarize.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
from __future__ import absolute_import
import os
from os.path import join
from typing import Optional

from ocrd.processor.base import OcrdPageResult
from ocrd.processor.ocrd_page_result import OcrdPageResultImage

import kraken.binarization
from ocrd import Processor
from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml
from ocrd_modelfactory import page_from_file

from ocrd_kraken.config import OCRD_TOOL


class KrakenBinarize(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-binarize']
kwargs['version'] = OCRD_TOOL['version']
super(KrakenBinarize, self).__init__(*args, **kwargs)
@property
def executable(self):
return 'ocrd-kraken-binarize'

def process(self):
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
"""Binarize the pages/regions/lines with Kraken.

Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested
Iterate over the input PAGE element hierarchy down to the requested
``level-of-operation``.

Next, for each file, crop each segment image according to the layout
Expand All @@ -36,64 +37,37 @@ def process(self):

Produce a new output file by serialising the resulting hierarchy.
"""
log = getLogger('processor.KrakenBinarize')
log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
log.debug('Input file group %s', self.input_file_grp)
log.debug('Input files %s', [str(f) for f in self.input_files])
for (n, input_file) in enumerate(self.input_files):
log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
self.add_metadata(pcgts)
assert self.workspace
assert self.output_file_grp
self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation'])

page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'page':
log.info("Binarizing page '%s'", page_id)
bin_image = kraken.binarization.nlbin(page_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
page.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=page_coords['features'] + ',binarized'))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
log.info("Binarizing region '%s'", region.id)
bin_image = kraken.binarization.nlbin(region_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
region.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=region_coords['features'] + ',binarized'))
else:
for line in region.get_TextLine():
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_filter='binarized')
log.info("Binarizing line '%s'", line.id)
bin_image = kraken.binarization.nlbin(line_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
line.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=line_coords['features'] + ',binarized'))
# update METS (add the PAGE file):
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
pcgts.set_pcGtsId(file_id)
out = self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
local_filename=file_path,
mimetype=MIMETYPE_PAGE,
content=to_xml(pcgts))
pcgts = input_pcgts[0]
assert pcgts
page = pcgts.get_Page()
assert page
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
result = OcrdPageResult(pcgts)
if self.parameter['level-of-operation'] == 'page':
self.logger.info("Binarizing page '%s'", page_id)
alternative_image = AlternativeImageType(comments=f'{page_xywh["features"]},binarized')
page.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(page_image), '.IMG-BIN', alternative_image))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
self.logger.info("Binarizing region '%s'", region.id)
alternative_image = AlternativeImageType(comments=f'{region_xywh["features"]},binarized')
region.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(region_image), f'{region.id}.IMG-BIN', alternative_image))
else:
for line in region.get_TextLine():
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh, feature_filter='binarized')
self.logger.info("Binarizing line '%s'", line.id)
alternative_image = AlternativeImageType(comments=f'{line_xywh["features"]},binarized')
line.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(line_image), f'{region.id}_{line.id}.IMG-BIN', alternative_image))
return result
10 changes: 0 additions & 10 deletions ocrd_kraken/cli.py

This file was deleted.

5 changes: 0 additions & 5 deletions ocrd_kraken/config.py

This file was deleted.

12 changes: 6 additions & 6 deletions ocrd_kraken/ocrd-tool.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"tools": {
"ocrd-kraken-binarize": {
"executable": "ocrd-kraken-binarize",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-PRE-BIN"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Image preprocessing"
],
Expand All @@ -24,8 +24,8 @@
},
"ocrd-kraken-segment": {
"executable": "ocrd-kraken-segment",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-PRE-BIN"],
"output_file_grp": ["OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Layout analysis"
],
Expand Down Expand Up @@ -128,8 +128,8 @@
},
"ocrd-kraken-recognize": {
"executable": "ocrd-kraken-recognize",
"input_file_grp": ["OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-OCR-KRAK"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": ["Text recognition and optimization"],
"steps": ["recognition/text-recognition"],
"description": "Text recognition with Kraken",
Expand Down
Loading
Loading