From 901098add604124bc45e5663908e80c6b04f7852 Mon Sep 17 00:00:00 2001 From: kba Date: Sun, 11 Aug 2024 14:42:41 +0200 Subject: [PATCH 01/25] bump requirement to ocrd >= 3.0.0a1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6bbfb40..8c70309 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 2.65 +ocrd >= 3.0.0a1 kraken >= 5.0 scipy shapely From 78849a9db2a14e83f0651009c659de47ce6a4be8 Mon Sep 17 00:00:00 2001 From: kba Date: Sun, 11 Aug 2024 14:42:54 +0200 Subject: [PATCH 02/25] port binarize to v3 --- ocrd_kraken/binarize.py | 121 +++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index 7c53940..809a919 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -1,9 +1,11 @@ from __future__ import absolute_import import os +from os.path import join +from typing import Optional import kraken.binarization from ocrd import Processor -from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE -from ocrd_models.ocrd_page import AlternativeImageType, to_xml +from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd_modelfactory import page_from_file from ocrd_kraken.config import OCRD_TOOL @@ -11,16 +13,19 @@ class KrakenBinarize(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-binarize'] - kwargs['version'] = OCRD_TOOL['version'] - super(KrakenBinarize, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-kraken-binarize' - def process(self): + def setup(self): + self.logger = getLogger('processor.KrakenBinarize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """Binarize the pages/regions/lines with Kraken. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + Iterate over the input PAGE element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -36,64 +41,42 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - log = getLogger('processor.KrakenBinarize') - log.debug('Level of operation: "%s"', self.parameter['level-of-operation']) - log.debug('Input file group %s', self.input_file_grp) - log.debug('Input files %s', [str(f) for f in self.input_files]) - for (n, input_file) in enumerate(self.input_files): - log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - self.add_metadata(pcgts) + assert self.workspace + assert self.output_file_grp + self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation']) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['level-of-operation'] == 'page': - log.info("Binarizing page '%s'", page_id) - bin_image = kraken.binarization.nlbin(page_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=page_coords['features'] + ',binarized')) - else: - for region in page.get_AllRegions(classes=['Text']): - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_filter='binarized') - if self.parameter['level-of-operation'] == 'region': - log.info("Binarizing region '%s'", region.id) - bin_image = kraken.binarization.nlbin(region_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '_' + region.id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=region_coords['features'] + ',binarized')) - else: - for line in region.get_TextLine(): - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, feature_filter='binarized') - log.info("Binarizing line '%s'", line.id) - bin_image = kraken.binarization.nlbin(line_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_coords['features'] + ',binarized')) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + page_image, page_xywh, _ = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') + ret = [pcgts] + if self.parameter['level-of-operation'] == 'page': + self.logger.info("Binarizing page '%s'", page_id) + bin_image = kraken.binarization.nlbin(page_image) + bin_image_id = f'{output_file_id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) + ret.append((bin_image, bin_image_id, bin_image_path)) + else: + for region in page.get_AllRegions(classes=['Text']): + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if self.parameter['level-of-operation'] == 'region': + self.logger.info("Binarizing region '%s'", region.id) + bin_image = kraken.binarization.nlbin(region_image) + bin_image_id = f'{output_file_id}_{region.id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') + region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{region_xywh["features"]},binarized')) + ret.append((bin_image, bin_image_id, bin_image_path)) + else: + for line in region.get_TextLine(): + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + self.logger.info("Binarizing line '%s'", line.id) + bin_image = kraken.binarization.nlbin(line_image) + bin_image_id = f'{output_file_id}_{region.id}_{line.id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') + line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) + ret.append((bin_image, bin_image_id, bin_image_path)) + return ret From 30db9a4f76578e3926a6b0f5a375ff326277519b Mon Sep 17 00:00:00 2001 From: kba Date: Sun, 11 Aug 2024 14:55:19 +0200 Subject: [PATCH 03/25] port segment to v3 --- ocrd_kraken/segment.py | 157 +++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 86 deletions(-) diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index 14e19dc..bdce58f 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -1,3 +1,4 @@ +from typing import Optional from PIL import ImageOps from os.path import join @@ -16,6 +17,7 @@ ) import ocrd_models.ocrd_page from ocrd_models.ocrd_page import ( + OcrdPage, PageType, BorderType, TextRegionType, @@ -34,19 +36,17 @@ class KrakenSegment(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-kraken-segment' def setup(self): """ Load models """ - self.log = getLogger('processor.KrakenSegment') + self.logger = getLogger('processor.KrakenSegment') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) kwargs = {} kwargs['text_direction'] = self.parameter['text_direction'] self.use_legacy = self.parameter['use_legacy'] @@ -55,29 +55,28 @@ def setup(self): kwargs['scale'] = self.parameter['scale'] kwargs['maxcolseps'] = self.parameter['maxcolseps'] kwargs['black_colseps'] = self.parameter['black_colseps'] - self.log.info("Using legacy segmenter") + self.logger.info("Using legacy segmenter") else: from kraken.lib.vgsl import TorchVGSLModel from kraken.blla import segment - self.log.info("Using blla segmenter") + self.logger.info("Using blla segmenter") blla_model_fname = self.resolve_resource(self.parameter['blla_model']) kwargs['model'] = TorchVGSLModel.load_model(blla_model_fname) device = self.parameter['device'] if device != 'cpu' and not torch.cuda.is_available(): device = 'cpu' if device == 'cpu': - self.log.warning("no CUDA device available. Running without GPU will be slow") + self.logger.warning("no CUDA device available. Running without GPU will be slow") kwargs['device'] = device def segmenter(img, mask=None): return segment(img, mask=mask, **kwargs) self.segmenter = segmenter - def process(self): + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """Segment into (regions and) lines with Kraken. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the ``level-of-operation``, - i.e.: + Iterate over the element hierarchy of the PAGE-XML down to the + ``level-of-operation``, i.e.: \b - On `page` level and `table` level, detect text regions and lines @@ -96,68 +95,54 @@ def process(self): Then compute a segmentation and decode it into new (text regions and) lines, and append them to the parent segment. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - self.log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_info = self.workspace.image_from_page( - page, page_id, - feature_selector="binarized" if self.use_legacy else "") - if page_info.resolution != 1: - dpi = page_info.resolution - if page_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - zoom = 300.0 / dpi - else: - zoom = 1.0 - # TODO: be DPI-relative + pcgts = input_pcgts[0] + page = pcgts.get_Page() + page_image, page_coords, page_info = self.workspace.image_from_page( + page, page_id, + feature_selector="binarized" if self.use_legacy else "") + if page_info.resolution != 1: + dpi = page_info.resolution + if page_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + zoom = 300.0 / dpi + else: + zoom = 1.0 + # TODO: be DPI-relative - if self.parameter['level-of-operation'] == 'page': - self.log.info('Segmenting page with %s segmenter', 'legacy' if self.use_legacy else 'blla') + if self.parameter['level-of-operation'] == 'page': + self.logger.info('Segmenting page with %s segmenter', 'legacy' if self.use_legacy else 'blla') + if self.parameter['overwrite_segments']: + page.TextRegion = [] + elif len(page.TextRegion or []): + self.logger.warning('Keeping %d text regions on page "%s"', len(page.TextRegion or []), page.id) + self._process_page(page_image, page_coords, page, zoom) + elif self.parameter['level-of-operation'] == 'table': + regions = page.get_AllRegions(classes=['Table']) + if not regions: + self.logger.warning('No existing table regions on page "%s"', page_id) + for region in regions: + self.logger.info('Segmenting table region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') if self.parameter['overwrite_segments']: - page.TextRegion = [] - elif len(page.TextRegion or []): - self.log.warning('Keeping %d text regions on page "%s"', len(page.TextRegion or []), page.id) - self._process_page(page_image, page_coords, page, zoom) - elif self.parameter['level-of-operation'] == 'table': - regions = page.get_AllRegions(classes=['Table']) - if not regions: - self.log.warning('No existing table regions on page "%s"', page_id) - for region in regions: - self.log.info('Segmenting table region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') - if self.parameter['overwrite_segments']: - region.TextRegion = [] - elif len(region.TextRegion or []): - self.log.warning('Keeping %d text regions in region "%s"', len(region.TextRegion or []), region.id) - self._process_page(page_image, page_coords, region, zoom) - else: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.log.warning('No existing text regions on page "%s"', page_id) - for region in regions: - self.log.info('Segmenting text region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') - if self.parameter['overwrite_segments']: - region.TextLine = [] - elif len(region.TextLine or []): - self.log.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) - self._process_region(page_image, page_coords, region, zoom) + region.TextRegion = [] + elif len(region.TextRegion or []): + self.logger.warning('Keeping %d text regions in region "%s"', len(region.TextRegion or []), region.id) + self._process_page(page_image, page_coords, region, zoom) + else: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('No existing text regions on page "%s"', page_id) + for region in regions: + self.logger.info('Segmenting text region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') + if self.parameter['overwrite_segments']: + region.TextLine = [] + elif len(region.TextLine or []): + self.logger.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) + self._process_region(page_image, page_coords, region, zoom) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, f'{file_id}.xml'), - content=to_xml(pcgts)) + return pcgts def _process_page(self, page_image, page_coords, page, zoom=1.0): def getmask(): @@ -192,15 +177,15 @@ def getmask(): # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask = ImageOps.invert(polygon_mask(page_image, poly)) for region in regions: - self.log.info("Masking existing region %s", region.id) + self.logger.info("Masking existing region %s", region.id) poly = coordinates_of_segment(region, page_image, page_coords) # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask.paste(255, mask=polygon_mask(page_image, poly)) return mask res = self.segmenter(page_image, mask=getmask()) - self.log.debug("Finished segmentation, serializing") + self.logger.debug("Finished segmentation, serializing") if self.use_legacy: - self.log.debug(res) + self.logger.debug(res) idx_line = 0 for idx_line, line in enumerate(res.lines): line_poly = polygon_from_x0y0x1y1(line.bbox) @@ -213,9 +198,9 @@ def getmask(): id=f'region_line_{idx_line + 1}_line', Coords=CoordsType(points=line_points))) page.add_TextRegion(region_elem) - self.log.debug("Found %d lines on page %s", idx_line + 1, page.id) + self.logger.debug("Found %d lines on page %s", idx_line + 1, page.id) else: - self.log.debug(res) + self.logger.debug(res) handled_lines = {} regions = [(type_, region) for type_ in res.regions @@ -239,11 +224,11 @@ def getmask(): line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'region_{idx_region + 1}_line_{idx_line + 1}' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = make_valid(geom.Polygon(line_poly)) if region_poly.contains(line_poly): if idx_line in handled_lines: - self.log.error("Line %s was already added to region %s" % (idx_line, handled_lines[idx_line])) + self.logger.error("Line %s was already added to region %s" % (idx_line, handled_lines[idx_line])) continue region_elem.add_TextLine(TextLineType( id=line_id, @@ -252,12 +237,12 @@ def getmask(): handled_lines[idx_line] = idx_region for idx_line, line in enumerate(res.lines): if idx_line not in handled_lines: - self.log.error("Line %s could not be assigned a region, creating a dummy region", idx_line) + self.logger.error("Line %s could not be assigned a region, creating a dummy region", idx_line) line_poly = coordinates_for_segment(line.boundary, None, page_coords) line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'region_line_{idx_line + 1}_line' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = make_valid(geom.Polygon(line_poly)).exterior.coords[:-1] region_elem = TextRegionType( id='region_line_%s' % (idx_line + 1), @@ -267,7 +252,7 @@ def getmask(): Baseline=BaselineType(points=points_from_polygon(line_baseline)), Coords=CoordsType(points=points_from_polygon(line_poly)))) page.add_TextRegion(region_elem) - self.log.debug("Found %d lines and %d regions on page %s", idx_line + 1, idx_region + 1, page.id) + self.logger.debug("Found %d lines and %d regions on page %s", idx_line + 1, idx_region + 1, page.id) def _process_region(self, page_image, page_coords, region, zoom=1.0): def getmask(): @@ -275,13 +260,13 @@ def getmask(): poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask = ImageOps.invert(polygon_mask(page_image, poly)) for line in region.TextLine: - self.log.info("Masking existing line %s", line.id) + self.logger.info("Masking existing line %s", line.id) poly = coordinates_of_segment(line, page_image, page_coords) # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask.paste(255, mask=polygon_mask(page_image, poly)) return mask res = self.segmenter(page_image, mask=getmask()) - self.log.debug("Finished segmentation, serializing") + self.logger.debug("Finished segmentation, serializing") idx_line = 0 if self.use_legacy: for idx_line, line in enumerate(res.lines): @@ -297,7 +282,7 @@ def getmask(): line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'{region.id}_line_{idx_line + 1}' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = geom.Polygon(line_poly) #line_poly = line_poly.intersection(region_poly) line_poly = make_valid(line_poly).exterior.coords[:-1] @@ -305,7 +290,7 @@ def getmask(): id=line_id, Baseline=BaselineType(points=points_from_polygon(line_baseline)), Coords=CoordsType(points=points_from_polygon(line_poly)))) - self.log.debug("Found %d lines in region %s", idx_line + 1, region.id) + self.logger.debug("Found %d lines in region %s", idx_line + 1, region.id) def make_valid(polygon): for split in range(1, len(polygon.exterior.coords)-1): From 9ea80c74f947d5c7ede2cd6bd2fa6c4bfe6e811a Mon Sep 17 00:00:00 2001 From: kba Date: Sun, 11 Aug 2024 14:55:30 +0200 Subject: [PATCH 04/25] port recognize to v3 --- ocrd_kraken/recognize.py | 283 +++++++++++++++++++-------------------- 1 file changed, 135 insertions(+), 148 deletions(-) diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index 2e2ed1d..a0773d1 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -1,4 +1,5 @@ from os.path import join +from typing import Union import regex import itertools import numpy as np @@ -24,6 +25,7 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + OcrdPage, RegionRefType, RegionRefIndexedType, OrderedGroupType, @@ -46,29 +48,28 @@ class KrakenRecognize(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-recognize'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-kraken-recognize' def setup(self): """ - Load models + Assert filegrp cardinality, load model, set predict function """ - log = getLogger('processor.KrakenRecognize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + + self.logger = getLogger('processor.KrakenRecognize') import torch from kraken.rpred import rpred from kraken.lib.models import load_any model_fname = self.resolve_resource(self.parameter['model']) - log.info("loading model '%s'", model_fname) + self.logger.info("loading model '%s'", model_fname) device = self.parameter['device'] if device != 'cpu' and not torch.cuda.is_available(): device = 'cpu' if device == 'cpu': - log.warning("no CUDA device available. Running without GPU will be slow") + self.logger.warning("no CUDA device available. Running without GPU will be slow") self.model = load_any(model_fname, device=device) def predict(page_image, segmentation): return rpred(self.model, page_image, segmentation, @@ -76,11 +77,11 @@ def predict(page_image, segmentation): self.parameter['bidi_reordering']) self.predict = predict - def process(self): + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Recognize text on lines with Kraken. - Open and deserialise each PAGE input file and its respective image, - then iterate over the element hierarchy down to the line level. + Open the parsed PAGE-XML file, then iterate over the element hierarchy + down to the line level. Set up Kraken to recognise each text line (via coordinates into the higher-level image, or from the alternative image. If the model @@ -94,149 +95,135 @@ def process(self): into additional TextEquiv at each level, and make the higher levels consistent with that (by concatenation joined by whitespace). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting hierarchy. """ + assert self.workspace from kraken.containers import Segmentation, BaselineLine, BBoxLine - log = getLogger('processor.KrakenRecognize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, - feature_selector="binarized" - if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1' - else '') - page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1) - # todo: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate + pcgts = input_pcgts[0] + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + feature_selector="binarized" + if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1' + else '') + page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1) + # TODO: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate - all_lines = page.get_AllTextLines() - # assumes that missing baselines are rare, if any - if any(line.Baseline for line in all_lines): - log.info("Converting PAGE to Kraken Segmentation (baselines)") - segtype = 'baselines' - else: - log.info("Converting PAGE to Kraken Segmentation (boxes only)") - segtype = 'bbox' - scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines]) - log.info("Estimated scale: %.1f", scale) - seglines = [] - for line in all_lines: - # FIXME: see whether model prefers baselines or bbox crops (seg_type) - # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization - poly = coordinates_of_segment(line, None, page_coords) - poly = make_valid(Polygon(poly)) - poly = poly.intersection(page_rect) - if segtype == 'baselines': - if line.Baseline is None: - base = dummy_baseline_of_segment(line, page_coords) - else: - base = baseline_of_segment(line, page_coords) - if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1: - base = dummy_baseline_of_segment(line, page_coords) - elif not LineString(base).intersects(poly): - base = dummy_baseline_of_segment(line, page_coords) - # kraken expects baseline to be fully contained in boundary - base = LineString(base) - if poly.is_empty: - poly = polygon_from_baseline(base, scale=scale) - elif not base.within(poly): - poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)], - loc=line.id, scale=scale) - seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)), - boundary=list(map(tuple, poly.exterior.coords)), - id=line.id, - tags={'type': 'default'})) - # write back - base = coordinates_for_segment(base.coords, None, page_coords) - line.set_Baseline(BaselineType(points=points_from_polygon(base))) - poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords) - line.set_Coords(CoordsType(points=points_from_polygon(poly))) + all_lines = page.get_AllTextLines() + # assumes that missing baselines are rare, if any + if any(line.Baseline for line in all_lines): + self.logger.info("Converting PAGE to Kraken Segmentation (baselines)") + segtype = 'baselines' + else: + self.logger.info("Converting PAGE to Kraken Segmentation (boxes only)") + segtype = 'bbox' + scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines]) + self.logger.info("Estimated scale: %.1f", scale) + seglines = [] + for line in all_lines: + # FIXME: see whether model prefers baselines or bbox crops (seg_type) + # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization + poly = coordinates_of_segment(line, None, page_coords) + poly = make_valid(Polygon(poly)) + poly = poly.intersection(page_rect) + if segtype == 'baselines': + if line.Baseline is None: + base = dummy_baseline_of_segment(line, page_coords) else: - seglines.append(BBoxLine(bbox=poly.envelope.bounds, - id=line.id)) + base = baseline_of_segment(line, page_coords) + if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1: + base = dummy_baseline_of_segment(line, page_coords) + elif not LineString(base).intersects(poly): + base = dummy_baseline_of_segment(line, page_coords) + # kraken expects baseline to be fully contained in boundary + base = LineString(base) + if poly.is_empty: + poly = polygon_from_baseline(base, scale=scale) + elif not base.within(poly): + poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)], + loc=line.id, scale=scale) + seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)), + boundary=list(map(tuple, poly.exterior.coords)), + id=line.id, + tags={'type': 'default'})) + # write back + base = coordinates_for_segment(base.coords, None, page_coords) + line.set_Baseline(BaselineType(points=points_from_polygon(base))) + poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords) + line.set_Coords(CoordsType(points=points_from_polygon(poly))) + else: + seglines.append(BBoxLine(bbox=poly.envelope.bounds, + id=line.id)) - segmentation = Segmentation(lines=seglines, - script_detection=False, - text_direction='horizontal-lr', - type=segtype, - imagename=page_id) - for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)): - line = all_lines[idx_line] - id_line = line.id - if not ocr_record.prediction and not ocr_record.cuts: - log.warning('No results for line "%s"', line.id) + segmentation = Segmentation(lines=seglines, + script_detection=False, + text_direction='horizontal-lr', + type=segtype, + imagename=page_id) + for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)): + line = all_lines[idx_line] + id_line = line.id + if not ocr_record.prediction and not ocr_record.cuts: + self.logger.warning('No results for line "%s"', line.id) + continue + text_line = ocr_record.prediction + if len(ocr_record.confidences) > 0: + conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences) + else: + conf_line = None + if self.parameter['overwrite_text']: + line.TextEquiv = [] + line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line)) + idx_word = 0 + line_offset = 0 + for text_word in regex.splititer(r'(\s+)', text_line): + next_offset = line_offset + len(text_word) + cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset])) + # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops + # as a workaround, here we just steal from the next glyph start, respectively: + if len(ocr_record.cuts) > next_offset + 1: + cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1]))) + else: + cuts_word.append(list(ocr_record.cuts[-1])) + confidences_word = ocr_record.confidences[line_offset:next_offset] + line_offset = next_offset + if len(text_word.strip()) == 0: continue - text_line = ocr_record.prediction - if len(ocr_record.confidences) > 0: - conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences) + id_word = '%s_word_%s' % (id_line, idx_word + 1) + idx_word += 1 + poly_word = [point for cut in cuts_word for point in cut] + bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords)) + # avoid zero-size coords on ties + bbox_word = np.array(bbox_word, dtype=int) + if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0: + bbox_word[2:4] += 1 + if len(confidences_word) > 0: + conf_word = sum(confidences_word) / len(confidences_word) else: - conf_line = None - if self.parameter['overwrite_text']: - line.TextEquiv = [] - line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line)) - idx_word = 0 - line_offset = 0 - for text_word in regex.splititer(r'(\s+)', text_line): - next_offset = line_offset + len(text_word) - cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset])) - # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops - # as a workaround, here we just steal from the next glyph start, respectively: - if len(ocr_record.cuts) > next_offset + 1: - cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1]))) - else: - cuts_word.append(list(ocr_record.cuts[-1])) - confidences_word = ocr_record.confidences[line_offset:next_offset] - line_offset = next_offset - if len(text_word.strip()) == 0: - continue - id_word = '%s_word_%s' % (id_line, idx_word + 1) - idx_word += 1 - poly_word = [point for cut in cuts_word for point in cut] - bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords)) + conf_word = None + word = WordType(id=id_word, + Coords=CoordsType(points=points_from_bbox(*bbox_word))) + word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word)) + for idx_glyph, text_glyph in enumerate(text_word): + id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1) + poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1] + bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords)) # avoid zero-size coords on ties - bbox_word = np.array(bbox_word, dtype=int) - if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0: - bbox_word[2:4] += 1 - if len(confidences_word) > 0: - conf_word = sum(confidences_word) / len(confidences_word) - else: - conf_word = None - word = WordType(id=id_word, - Coords=CoordsType(points=points_from_bbox(*bbox_word))) - word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word)) - for idx_glyph, text_glyph in enumerate(text_word): - id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1) - poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1] - bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords)) - # avoid zero-size coords on ties - bbox_glyph = np.array(bbox_glyph, dtype=int) - if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0: - bbox_glyph[2:4] += 1 - conf_glyph = confidences_word[idx_glyph] - glyph = GlyphType(id=id_glyph, - Coords=CoordsType(points=points_from_bbox(*bbox_glyph))) - glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph)) - word.add_Glyph(glyph) - line.add_Word(word) - log.info('Recognized line "%s"', line.id) + bbox_glyph = np.array(bbox_glyph, dtype=int) + if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0: + bbox_glyph[2:4] += 1 + conf_glyph = confidences_word[idx_glyph] + glyph = GlyphType(id=id_glyph, + Coords=CoordsType(points=points_from_bbox(*bbox_glyph))) + glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph)) + word.add_Glyph(glyph) + line.add_Word(word) + self.logger.info('Recognized line "%s"', line.id) page_update_higher_textequiv_levels('line', pcgts) - log.info("Finished recognition, serializing") - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, f'{file_id}.xml'), - content=to_xml(pcgts)) + self.logger.info("Finished recognition, serializing") + return pcgts # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): @@ -251,7 +238,7 @@ def dummy_baseline_of_segment(segment, coords, yrel=0.2): return [[xmin, ymid], [xmax, ymid]] # zzz should go into core ocrd_utils -def polygon_from_baseline(baseline, scale=20): +def polygon_from_baseline(baseline, scale : Union[float, np.floating] = 20): if not isinstance(baseline, LineString): baseline = LineString(baseline) ltr = baseline.coords[0][0] < baseline.coords[-1][0] @@ -261,7 +248,7 @@ def polygon_from_baseline(baseline, scale=20): scale=scale)) return polygon -def join_polygons(polygons, loc='', scale=20): +def join_polygons(polygons, loc='', scale : Union[float, np.floating] = 20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull From 163ee7df929ba7be6126f4e9b3ec873c853f8a0e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:16:02 +0200 Subject: [PATCH 05/25] ocrd-tool.json: add cardinality specs --- ocrd_kraken/ocrd-tool.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ocrd_kraken/ocrd-tool.json b/ocrd_kraken/ocrd-tool.json index beac9ab..576aaf7 100644 --- a/ocrd_kraken/ocrd-tool.json +++ b/ocrd_kraken/ocrd-tool.json @@ -4,8 +4,8 @@ "tools": { "ocrd-kraken-binarize": { "executable": "ocrd-kraken-binarize", - "input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-SEG-REGION", "OCR-D-SEG-LINE"], - "output_file_grp": ["OCR-D-PRE-BIN"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": [ "Image preprocessing" ], @@ -24,8 +24,8 @@ }, "ocrd-kraken-segment": { "executable": "ocrd-kraken-segment", - "input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-PRE-BIN"], - "output_file_grp": ["OCR-D-SEG-REGION", "OCR-D-SEG-LINE"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": [ "Layout analysis" ], @@ -128,8 +128,8 @@ }, "ocrd-kraken-recognize": { "executable": "ocrd-kraken-recognize", - "input_file_grp": ["OCR-D-SEG-LINE"], - "output_file_grp": ["OCR-D-OCR-KRAK"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": ["Text recognition and optimization"], "steps": ["recognition/text-recognition"], "description": "Text recognition with Kraken", From 41b0045d045f3bc3e430d41a38a22c18de8393ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:21:11 +0200 Subject: [PATCH 06/25] test_binarize.py: use stable API --- tests/test_binarize.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 8f5c4b8..d455efd 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -6,7 +6,7 @@ from tests.base import assets, main -from ocrd import Resolver +from ocrd import Resolver, run_processor from ocrd_kraken.binarize import KrakenBinarize from ocrd_utils.logging import setOverrideLogLevel @@ -37,24 +37,24 @@ def workspace(tmpdir): # ) def test_binarize_regions(workspace): - proc = KrakenBinarize( - workspace, - input_file_grp="OCR-D-GT-PAGE", - output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'region'} + run_processor(KrakenBinarize, + workspace=workspace, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-IMG-BIN-KRAKEN", + parameter={'level-of-operation': 'region'} ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) def test_binarize_lines(workspace): - proc = KrakenBinarize( - workspace, - input_file_grp="OCR-D-GT-PAGE", - output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'line'} + run_processor(KrakenBinarize, + workspace=workspace, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-IMG-BIN-KRAKEN", + parameter={'level-of-operation': 'line'} ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) if __name__ == "__main__": main(__file__) From 340f51397b584da399214ae2db6a8c38450a7b43 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:22:15 +0200 Subject: [PATCH 07/25] test_recognize.py: use stable API --- tests/test_recognize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 0ae2850..1d5f924 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -20,13 +20,13 @@ def test_recognize(self): with pushd_popd(tempdir=True) as tempdir: workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) workspace.overwrite_mode = True - proc = KrakenRecognize( - workspace, - input_file_grp="OCR-D-SEG-KRAKEN", - output_file_grp="OCR-D-OCR-KRAKEN", + run_processor(KrakenRecognize, + workspace=workspace, + input_file_grp="OCR-D-SEG-KRAKEN", + output_file_grp="OCR-D-OCR-KRAKEN", ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) if __name__ == "__main__": main(__file__) From cd0ce01edf7a10701ddd1500c6144166b6a696f5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:24:13 +0200 Subject: [PATCH 08/25] test_segment.py: use stable API --- tests/test_segment.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/test_segment.py b/tests/test_segment.py index 627fbbf..9cb4360 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -5,7 +5,7 @@ from tests.base import TestCase, assets, main -from ocrd import Resolver +from ocrd import Resolver, run_processor from ocrd_utils import initLogging, pushd_popd from ocrd_kraken.segment import KrakenSegment @@ -18,42 +18,45 @@ def test_run_blla(self): resolver = Resolver() with pushd_popd(tempdir=True) as tempdir: workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, + run_processor( + KrakenSegment, + workspace=workspace, input_file_grp="OCR-D-IMG-BIN", output_file_grp="OCR-D-SEG-LINE-KRAKEN", parameter={'maxcolseps': 0, 'use_legacy': False} ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) def test_run_blla_regionlevel(self): resolver = Resolver() with pushd_popd(tempdir=True) as tempdir: workspace = resolver.workspace_from_url(assets.path_to('kant_aufklaerung_1784-page-region/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, + run_processor( + KrakenSegment, + workspace=workspace, input_file_grp="OCR-D-GT-SEG-REGION", output_file_grp="OCR-D-SEG-LINE-KRAKEN", page_id="phys_0005", parameter={'maxcolseps': 0, 'use_legacy': False} ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) def test_run_legacy(self): resolver = Resolver() # with pushd_popd('/tmp/kraken-test') as tempdir: with pushd_popd(tempdir=True) as tempdir: workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, + run_processor( + KrakenSegment, + workspace=workspace, input_file_grp="OCR-D-IMG-BIN", output_file_grp="OCR-D-SEG-LINE-KRAKEN", parameter={'maxcolseps': 0, 'use_legacy': True} ) - proc.process() workspace.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) if __name__ == "__main__": main(__file__) From 4671e982e51f8b9924c75d6632d4bea86f0b4cea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:25:38 +0200 Subject: [PATCH 09/25] remove fileGrp cardinality assertions --- ocrd_kraken/binarize.py | 5 ----- ocrd_kraken/recognize.py | 4 +--- ocrd_kraken/segment.py | 2 -- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index 809a919..c4379bc 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -17,11 +17,6 @@ class KrakenBinarize(Processor): def executable(self): return 'ocrd-kraken-binarize' - def setup(self): - self.logger = getLogger('processor.KrakenBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """Binarize the pages/regions/lines with Kraken. diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index a0773d1..dea439a 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -54,10 +54,8 @@ def executable(self): def setup(self): """ - Assert filegrp cardinality, load model, set predict function + Load model, set predict function """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) self.logger = getLogger('processor.KrakenRecognize') import torch diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index bdce58f..5b8fe4b 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -45,8 +45,6 @@ def setup(self): Load models """ self.logger = getLogger('processor.KrakenSegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) kwargs = {} kwargs['text_direction'] = self.parameter['text_direction'] self.use_legacy = self.parameter['use_legacy'] From a497287f949200f8eb9f84f5cb69843e44e2c05f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 08:12:03 +0200 Subject: [PATCH 10/25] binarize: re-instate setup for logger --- ocrd_kraken/binarize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index c4379bc..5104a87 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -17,6 +17,9 @@ class KrakenBinarize(Processor): def executable(self): return 'ocrd-kraken-binarize' + def setup(self): + self.logger = getLogger('processor.KrakenBinarize') + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """Binarize the pages/regions/lines with Kraken. From c0c1eb7f5ca2e4248f1a824f3428fb566ed5cf28 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 19:49:26 +0200 Subject: [PATCH 11/25] adapt to bertsky/core#8 --- ocrd_kraken/binarize.py | 14 ++++++++------ ocrd_kraken/recognize.py | 7 ++++--- ocrd_kraken/segment.py | 6 ++++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index 809a919..08038c9 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -2,6 +2,8 @@ import os from os.path import join from typing import Optional + +from ocrd_models.ocrd_process_result import OcrdProcessResult import kraken.binarization from ocrd import Processor from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE @@ -22,7 +24,7 @@ def setup(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: """Binarize the pages/regions/lines with Kraken. Iterate over the input PAGE element hierarchy down to the requested @@ -50,14 +52,14 @@ def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[ assert page page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id, feature_filter='binarized') - ret = [pcgts] + images = [] if self.parameter['level-of-operation'] == 'page': self.logger.info("Binarizing page '%s'", page_id) bin_image = kraken.binarization.nlbin(page_image) bin_image_id = f'{output_file_id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - ret.append((bin_image, bin_image_id, bin_image_path)) + images.append((bin_image, bin_image_id, bin_image_path)) else: for region in page.get_AllRegions(classes=['Text']): region_image, region_xywh = self.workspace.image_from_segment( @@ -68,7 +70,7 @@ def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[ bin_image_id = f'{output_file_id}_{region.id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{region_xywh["features"]},binarized')) - ret.append((bin_image, bin_image_id, bin_image_path)) + images.append((bin_image, bin_image_id, bin_image_path)) else: for line in region.get_TextLine(): line_image, line_xywh = self.workspace.image_from_segment( @@ -78,5 +80,5 @@ def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[ bin_image_id = f'{output_file_id}_{region.id}_{line.id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - ret.append((bin_image, bin_image_id, bin_image_path)) - return ret + images.append((bin_image, bin_image_id, bin_image_path)) + return OcrdProcessResult(pcgts, images) diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index a0773d1..8635551 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -1,5 +1,6 @@ from os.path import join -from typing import Union +from typing import Optional, Union +from ocrd_models import OcrdProcessResult import regex import itertools import numpy as np @@ -77,7 +78,7 @@ def predict(page_image, segmentation): self.parameter['bidi_reordering']) self.predict = predict - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: """Recognize text on lines with Kraken. Open the parsed PAGE-XML file, then iterate over the element hierarchy @@ -223,7 +224,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_update_higher_textequiv_levels('line', pcgts) self.logger.info("Finished recognition, serializing") - return pcgts + return OcrdProcessResult(pcgts) # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index bdce58f..3b29926 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -3,6 +3,7 @@ from os.path import join from ocrd import Processor +from ocrd_models import OcrdProcessResult from ocrd_utils import ( getLogger, assert_file_grp_cardinality, @@ -72,7 +73,7 @@ def segmenter(img, mask=None): return segment(img, mask=mask, **kwargs) self.segmenter = segmenter - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: """Segment into (regions and) lines with Kraken. Iterate over the element hierarchy of the PAGE-XML down to the @@ -100,6 +101,7 @@ def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[ pcgts = input_pcgts[0] page = pcgts.get_Page() + assert page page_image, page_coords, page_info = self.workspace.image_from_page( page, page_id, feature_selector="binarized" if self.use_legacy else "") @@ -142,7 +144,7 @@ def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[ self.logger.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) self._process_region(page_image, page_coords, region, zoom) - return pcgts + return OcrdProcessResult(pcgts) def _process_page(self, page_image, page_coords, page, zoom=1.0): def getmask(): From e8ec7fe6d364942967b0c0f4fc1ef1d75a6826f9 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:24:00 +0200 Subject: [PATCH 12/25] require regex --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8c70309..5639e86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ ocrd >= 3.0.0a1 kraken >= 5.0 scipy shapely +regex From e76d708e5a4956aa5f06c3efb2e760e4d6b06f09 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:24:14 +0200 Subject: [PATCH 13/25] update to OcrdPageResult change --- ocrd_kraken/binarize.py | 17 +++++++++-------- ocrd_kraken/recognize.py | 14 +++----------- ocrd_kraken/segment.py | 15 +++------------ 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index c367657..b430b66 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -1,9 +1,10 @@ from __future__ import absolute_import -import os from os.path import join from typing import Optional -from ocrd_models.ocrd_process_result import OcrdProcessResult +from ocrd.processor.base import OcrdPageResult +from ocrd.processor.ocrd_page_result import OcrdPageResultImage + import kraken.binarization from ocrd import Processor from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE @@ -22,7 +23,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.KrakenBinarize') - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: """Binarize the pages/regions/lines with Kraken. Iterate over the input PAGE element hierarchy down to the requested @@ -50,14 +51,14 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st assert page page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id, feature_filter='binarized') - images = [] + result = OcrdPageResult(pcgts) if self.parameter['level-of-operation'] == 'page': self.logger.info("Binarizing page '%s'", page_id) bin_image = kraken.binarization.nlbin(page_image) bin_image_id = f'{output_file_id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - images.append((bin_image, bin_image_id, bin_image_path)) + result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) else: for region in page.get_AllRegions(classes=['Text']): region_image, region_xywh = self.workspace.image_from_segment( @@ -68,7 +69,7 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st bin_image_id = f'{output_file_id}_{region.id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{region_xywh["features"]},binarized')) - images.append((bin_image, bin_image_id, bin_image_path)) + result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) else: for line in region.get_TextLine(): line_image, line_xywh = self.workspace.image_from_segment( @@ -78,5 +79,5 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st bin_image_id = f'{output_file_id}_{region.id}_{line.id}.IMG-BIN' bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - images.append((bin_image, bin_image_id, bin_image_path)) - return OcrdProcessResult(pcgts, images) + result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) + return result diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index e285c75..dd032da 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -1,6 +1,5 @@ -from os.path import join from typing import Optional, Union -from ocrd_models import OcrdProcessResult +from ocrd.processor.base import OcrdPageResult import regex import itertools import numpy as np @@ -11,8 +10,6 @@ from ocrd import Processor from ocrd_utils import ( getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, bbox_from_polygon, @@ -20,11 +17,8 @@ points_from_bbox, polygon_from_points, xywh_from_points, - bbox_from_points, transform_coordinates, - MIMETYPE_PAGE, ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( OcrdPage, RegionRefType, @@ -45,8 +39,6 @@ TextLineOrderSimpleType ) -from ocrd_kraken.config import OCRD_TOOL - class KrakenRecognize(Processor): @property @@ -76,7 +68,7 @@ def predict(page_image, segmentation): self.parameter['bidi_reordering']) self.predict = predict - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: """Recognize text on lines with Kraken. Open the parsed PAGE-XML file, then iterate over the element hierarchy @@ -222,7 +214,7 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st page_update_higher_textequiv_levels('line', pcgts) self.logger.info("Finished recognition, serializing") - return OcrdProcessResult(pcgts) + return OcrdPageResult(pcgts) # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index 2f09823..94887f3 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -1,20 +1,15 @@ from typing import Optional from PIL import ImageOps -from os.path import join from ocrd import Processor -from ocrd_models import OcrdProcessResult +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, - make_file_id, - concat_padded, polygon_from_x0y0x1y1, points_from_polygon, polygon_mask, coordinates_for_segment, coordinates_of_segment, - MIMETYPE_PAGE ) import ocrd_models.ocrd_page from ocrd_models.ocrd_page import ( @@ -25,16 +20,12 @@ TextLineType, CoordsType, BaselineType, - to_xml ) -from ocrd_modelfactory import page_from_file import shapely.geometry as geom from shapely.prepared import prep as geom_prep import torch -from .config import OCRD_TOOL - class KrakenSegment(Processor): @property @@ -71,7 +62,7 @@ def segmenter(img, mask=None): return segment(img, mask=mask, **kwargs) self.segmenter = segmenter - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: """Segment into (regions and) lines with Kraken. Iterate over the element hierarchy of the PAGE-XML down to the @@ -142,7 +133,7 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st self.logger.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) self._process_region(page_image, page_coords, region, zoom) - return OcrdProcessResult(pcgts) + return OcrdPageResult(pcgts) def _process_page(self, page_image, page_coords, page, zoom=1.0): def getmask(): From 283272200f3904ace382ee50c86a97956333f691 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 15:23:08 +0200 Subject: [PATCH 14/25] update to latest OcrdPageResult and process_page_pcgts --- ocrd_kraken/binarize.py | 27 +++++++++++---------------- ocrd_kraken/recognize.py | 4 +++- ocrd_kraken/segment.py | 3 ++- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index b430b66..ac7c4a9 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -23,7 +23,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.KrakenBinarize') - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize the pages/regions/lines with Kraken. Iterate over the input PAGE element hierarchy down to the requested @@ -47,6 +47,7 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation']) pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_xywh, _ = self.workspace.image_from_page( @@ -54,30 +55,24 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st result = OcrdPageResult(pcgts) if self.parameter['level-of-operation'] == 'page': self.logger.info("Binarizing page '%s'", page_id) - bin_image = kraken.binarization.nlbin(page_image) - bin_image_id = f'{output_file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) + alternative_image = AlternativeImageType(comments=f'{page_xywh["features"]},binarized') + page.add_AlternativeImage(alternative_image) + result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(page_image), '.IMG-BIN', alternative_image)) else: for region in page.get_AllRegions(classes=['Text']): region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') if self.parameter['level-of-operation'] == 'region': self.logger.info("Binarizing region '%s'", region.id) - bin_image = kraken.binarization.nlbin(region_image) - bin_image_id = f'{output_file_id}_{region.id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{region_xywh["features"]},binarized')) - result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) + alternative_image = AlternativeImageType(comments=f'{region_xywh["features"]},binarized') + region.add_AlternativeImage(alternative_image) + result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(region_image), f'{region.id}.IMG-BIN', alternative_image)) else: for line in region.get_TextLine(): line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') self.logger.info("Binarizing line '%s'", line.id) - bin_image = kraken.binarization.nlbin(line_image) - bin_image_id = f'{output_file_id}_{region.id}_{line.id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=f'{page_xywh["features"]},binarized')) - result.images.append(OcrdPageResultImage(bin_image, bin_image_id, bin_image_path)) + alternative_image = AlternativeImageType(comments=f'{line_xywh["features"]},binarized') + line.add_AlternativeImage(alternative_image) + result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(line_image), f'{region.id}_{line.id}.IMG-BIN', alternative_image)) return result diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index dd032da..01c8079 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -68,7 +68,7 @@ def predict(page_image, segmentation): self.parameter['bidi_reordering']) self.predict = predict - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Recognize text on lines with Kraken. Open the parsed PAGE-XML file, then iterate over the element hierarchy @@ -92,7 +92,9 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st from kraken.containers import Segmentation, BaselineLine, BBoxLine pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() + assert page page_image, page_coords, _ = self.workspace.image_from_page( page, page_id, feature_selector="binarized" diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index 94887f3..806b413 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -62,7 +62,7 @@ def segmenter(img, mask=None): return segment(img, mask=mask, **kwargs) self.segmenter = segmenter - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment into (regions and) lines with Kraken. Iterate over the element hierarchy of the PAGE-XML down to the @@ -89,6 +89,7 @@ def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[st """ pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_coords, page_info = self.workspace.image_from_page( From a8a859bf7ee52eadfc6aec0e5e03a84eb1f8945c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:20:44 +0200 Subject: [PATCH 15/25] CI: switch back to Ubuntu (after MacOS fails with `torch ... not supported on this platform` :roll_eyes: ) --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2c420e1..ed91176 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,8 @@ jobs: # # Related issue: https://github.com/actions/runner-images/issues/672. # runs-on: ubuntu-latest - runs-on: macos-latest + # runs-on: macos-latest + runs-on: ubuntu-latest strategy: fail-fast: false matrix: From 0e3013835502fc78cd0ff78a6d1be314151396bc Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:53:33 +0200 Subject: [PATCH 16/25] self.logger: adapt to bertsky/core#10 --- ocrd_kraken/binarize.py | 3 --- ocrd_kraken/recognize.py | 1 - ocrd_kraken/segment.py | 1 - 3 files changed, 5 deletions(-) diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index ac7c4a9..d8da6a4 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -20,9 +20,6 @@ class KrakenBinarize(Processor): def executable(self): return 'ocrd-kraken-binarize' - def setup(self): - self.logger = getLogger('processor.KrakenBinarize') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize the pages/regions/lines with Kraken. diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index 01c8079..4e97504 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -50,7 +50,6 @@ def setup(self): Load model, set predict function """ - self.logger = getLogger('processor.KrakenRecognize') import torch from kraken.rpred import rpred from kraken.lib.models import load_any diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index 806b413..774328f 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -36,7 +36,6 @@ def setup(self): """ Load models """ - self.logger = getLogger('processor.KrakenSegment') kwargs = {} kwargs['text_direction'] = self.parameter['text_direction'] self.use_legacy = self.parameter['use_legacy'] From 6d287b05cee0828c55bef91f871335e6e1cfda71 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 01:50:39 +0200 Subject: [PATCH 17/25] =?UTF-8?q?tests:=20migrate=20unittest=E2=86=92pytes?= =?UTF-8?q?t,=20simplify?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/base.py | 111 +++++++++++----------------------------- tests/test_binarize.py | 64 +++++++++-------------- tests/test_recognize.py | 41 +++++---------- tests/test_segment.py | 92 +++++++++++++-------------------- 4 files changed, 101 insertions(+), 207 deletions(-) diff --git a/tests/base.py b/tests/base.py index 1387769..9e6cebc 100644 --- a/tests/base.py +++ b/tests/base.py @@ -1,89 +1,36 @@ # pylint: disable=unused-import -from os.path import dirname, realpath -from os import chdir -import sys -import logging -import io -import collections -from unittest import TestCase as VanillaTestCase, skip, main as unittests_main +from multiprocessing import Process +from time import sleep import pytest -from ocrd_utils import disableLogging, initLogging -from tests.assets import assets, copy_of_directory +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import pushd_popd, disableLogging, initLogging, setOverrideLogLevel, config +from tests.assets import assets -def main(fn=None): - if fn: - sys.exit(pytest.main([fn])) - else: - unittests_main() - - -class TestCase(VanillaTestCase): - - @classmethod - def setUpClass(cls): - chdir(dirname(realpath(__file__)) + '/..') - - def setUp(self): - disableLogging() +@pytest.fixture +def workspace(tmpdir, pytestconfig): + def _make_workspace(workspace_path): initLogging() - -class CapturingTestCase(TestCase): - """ - A TestCase that needs to capture stderr/stdout and invoke click CLI. - """ - - @pytest.fixture(autouse=True) - def _setup_pytest_capfd(self, capfd): - self.capfd = capfd - - def invoke_cli(self, cli, args): - """ - Substitution for click.CliRunner.invooke that works together nicely - with unittests/pytest capturing stdout/stderr. - """ - self.capture_out_err() # XXX snapshot just before executing the CLI - code = 0 - sys.argv[1:] = args # XXX necessary because sys.argv reflects pytest args not cli args - try: - cli.main(args=args) - except SystemExit as e: - code = e.code - out, err = self.capture_out_err() - return code, out, err - - def capture_out_err(self): - return self.capfd.readouterr() - -# import traceback -# import warnings -# def warn_with_traceback(message, category, filename, lineno, file=None, line=None): -# log = file if hasattr(file, 'write') else sys.stderr -# traceback.print_stack(file=log) -# log.write(warnings.formatwarning(message, category, filename, lineno, line)) -# warnings.showwarning = warn_with_traceback - -# https://stackoverflow.com/questions/37944111/python-rolling-log-to-a-variable -# Adapted from http://alanwsmith.com/capturing-python-log-output-in-a-variable - -class FIFOIO(io.TextIOBase): - def __init__(self, size, *args): - self.maxsize = size - io.TextIOBase.__init__(self, *args) - self.deque = collections.deque() - def getvalue(self): - return ''.join(self.deque) - def write(self, x): - self.deque.append(x) - self.shrink() - def shrink(self): - if self.maxsize is None: - return - size = sum(len(x) for x in self.deque) - while size > self.maxsize: - x = self.deque.popleft() - size -= len(x) - -sys.path.append(dirname(realpath(__file__)) + '/../ocrd') + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(workspace_path, dst_dir=tmpdir, download=True) + return _make_workspace + +@pytest.fixture +def workspace_manifesto(workspace): + yield from workspace(assets.path_to('communist_manifesto/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung_region(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784-page-region/data/mets.xml')) + +@pytest.fixture +def workspace_sbb(workspace): + yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) diff --git a/tests/test_binarize.py b/tests/test_binarize.py index d455efd..3d5816c 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -1,60 +1,42 @@ # pylint: disable=import-error -import os -import shutil -import pytest +import json -from tests.base import assets, main +from tests.base import * -from ocrd import Resolver, run_processor +from ocrd import run_processor from ocrd_kraken.binarize import KrakenBinarize -from ocrd_utils.logging import setOverrideLogLevel - -setOverrideLogLevel('DEBUG') PARAM_JSON = assets.url_of('param-binarize.json') - -@pytest.fixture() -def workspace(tmpdir): - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - workspace = Resolver().workspace_from_url( - assets.path_to('kant_aufklaerung_1784/data/mets.xml'), - dst_dir=tmpdir, - download=True +def test_param_json(workspace_sbb): + ws = workspace_sbb + run_processor(KrakenBinarize, + workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-BIN-KRAKEN", + parameter=json.load(open(PARAM_JSON)), ) - return workspace + ws.save_mets() - -# def test_param_json(self): -# workspace = resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'), dst_dir=WORKSPACE_DIR) -# run_processor( -# KrakenBinarize, -# resolver=resolver, -# workspace=workspace, -# parameter=PARAM_JSON -# ) - -def test_binarize_regions(workspace): +def test_binarize_regions(workspace_aufklaerung): + ws = workspace_aufklaerung run_processor(KrakenBinarize, - workspace=workspace, + workspace=ws, input_file_grp="OCR-D-GT-PAGE", - output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'region'} + output_file_grp="OCR-D-BIN-KRAKEN", + parameter={'level-of-operation': 'region'}, ) - workspace.save_mets() + ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) -def test_binarize_lines(workspace): +def test_binarize_lines(workspace_aufklaerung): + ws = workspace_aufklaerung run_processor(KrakenBinarize, - workspace=workspace, + workspace=ws, input_file_grp="OCR-D-GT-PAGE", - output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'line'} + output_file_grp="OCR-D-BIN-KRAKEN", + parameter={'level-of-operation': 'line'}, ) - workspace.save_mets() + ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) - -if __name__ == "__main__": - main(__file__) diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 1d5f924..1b31cbd 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -1,32 +1,19 @@ # pylint: disable=import-error -import os -import shutil +from tests.base import * -from tests.base import TestCase, assets, main - -from ocrd import Resolver, run_processor -from ocrd_utils import initLogging, pushd_popd +from ocrd import run_processor from ocrd_kraken.recognize import KrakenRecognize -class TestKrakenRecognize(TestCase): - - def setUp(self): - initLogging() - - def test_recognize(self): - resolver = Resolver() - # with pushd_popd('/tmp/kraken-test') as tempdir: - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - workspace.overwrite_mode = True - run_processor(KrakenRecognize, - workspace=workspace, - input_file_grp="OCR-D-SEG-KRAKEN", - output_file_grp="OCR-D-OCR-KRAKEN", - ) - workspace.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) - -if __name__ == "__main__": - main(__file__) +def test_recognize(workspace_manifesto): + ws = workspace_manifesto + print(ws) + print(type(ws)) + print(ws.directory) + run_processor(KrakenRecognize, + workspace=ws, + input_file_grp="OCR-D-SEG-KRAKEN", + output_file_grp="OCR-D-OCR-KRAKEN", + ) + ws.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) diff --git a/tests/test_segment.py b/tests/test_segment.py index 9cb4360..7dc78ba 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -1,62 +1,40 @@ # pylint: disable=import-error -import os -import shutil +from tests.base import * -from tests.base import TestCase, assets, main - -from ocrd import Resolver, run_processor -from ocrd_utils import initLogging, pushd_popd +from ocrd import run_processor from ocrd_kraken.segment import KrakenSegment -class TestKrakenSegment(TestCase): - - def setUp(self): - initLogging() - - def test_run_blla(self): - resolver = Resolver() - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - run_processor( - KrakenSegment, - workspace=workspace, - input_file_grp="OCR-D-IMG-BIN", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': False} - ) - workspace.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) - - def test_run_blla_regionlevel(self): - resolver = Resolver() - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('kant_aufklaerung_1784-page-region/data/mets.xml'), dst_dir=tempdir, download=True) - run_processor( - KrakenSegment, - workspace=workspace, - input_file_grp="OCR-D-GT-SEG-REGION", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - page_id="phys_0005", - parameter={'maxcolseps': 0, 'use_legacy': False} - ) - workspace.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) - - def test_run_legacy(self): - resolver = Resolver() - # with pushd_popd('/tmp/kraken-test') as tempdir: - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - run_processor( - KrakenSegment, - workspace=workspace, - input_file_grp="OCR-D-IMG-BIN", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': True} - ) - workspace.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) - -if __name__ == "__main__": - main(__file__) +def test_run_blla(workspace_manifesto): + ws = workspace_manifesto + run_processor(KrakenSegment, + workspace=ws, + input_file_grp="OCR-D-IMG-BIN", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + parameter={'maxcolseps': 0, 'use_legacy': False} + ) + ws.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) + +def test_run_blla_regionlevel(workspace_aufklaerung_region): + ws = workspace_aufklaerung_region + run_processor(KrakenSegment, + workspace=ws, + input_file_grp="OCR-D-GT-SEG-REGION", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + page_id="phys_0005", + parameter={'maxcolseps': 0, 'use_legacy': False} + ) + ws.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) + +def test_run_legacy(workspace_manifesto): + ws = workspace_manifesto + run_processor(KrakenSegment, + workspace=ws, + input_file_grp="OCR-D-IMG-BIN", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + parameter={'maxcolseps': 0, 'use_legacy': True} + ) + ws.save_mets() + # FIXME: add result assertions (find_files, parsing PAGE etc) From 316eedb08893c8e1d19fd4f1aa1503df7d68deb3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 01:59:31 +0200 Subject: [PATCH 18/25] =?UTF-8?q?tests:=20base=E2=86=92conftest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/{base.py => conftest.py} | 3 ++- tests/test_binarize.py | 2 +- tests/test_recognize.py | 5 ----- tests/test_segment.py | 2 -- 4 files changed, 3 insertions(+), 9 deletions(-) rename tests/{base.py => conftest.py} (97%) diff --git a/tests/base.py b/tests/conftest.py similarity index 97% rename from tests/base.py rename to tests/conftest.py index 9e6cebc..3b8cd32 100644 --- a/tests/base.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from ocrd import Resolver, Workspace, OcrdMetsServer from ocrd_utils import pushd_popd, disableLogging, initLogging, setOverrideLogLevel, config -from tests.assets import assets +from .assets import assets @pytest.fixture def workspace(tmpdir, pytestconfig): @@ -34,3 +34,4 @@ def workspace_aufklaerung_region(workspace): @pytest.fixture def workspace_sbb(workspace): yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) + diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 3d5816c..4396aec 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -2,7 +2,7 @@ import json -from tests.base import * +from .assets import assets from ocrd import run_processor from ocrd_kraken.binarize import KrakenBinarize diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 1b31cbd..290d7d0 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -1,15 +1,10 @@ # pylint: disable=import-error -from tests.base import * - from ocrd import run_processor from ocrd_kraken.recognize import KrakenRecognize def test_recognize(workspace_manifesto): ws = workspace_manifesto - print(ws) - print(type(ws)) - print(ws.directory) run_processor(KrakenRecognize, workspace=ws, input_file_grp="OCR-D-SEG-KRAKEN", diff --git a/tests/test_segment.py b/tests/test_segment.py index 7dc78ba..27dc2d7 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -1,7 +1,5 @@ # pylint: disable=import-error -from tests.base import * - from ocrd import run_processor from ocrd_kraken.segment import KrakenSegment From 43c600f904a2cbffe79339ff4208e64ff6640bdd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 12:46:31 +0200 Subject: [PATCH 19/25] tests: also w/ METS server + page-parallel and w/ METS caching --- Makefile | 2 +- tests/conftest.py | 32 ++++++++++++++++++++++++++++---- tests/test_binarize.py | 17 +++++++++-------- tests/test_recognize.py | 5 +++-- tests/test_segment.py | 19 ++++++++++--------- 5 files changed, 51 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 5acb886..934202c 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ docker: # Run test test: tests/assets - $(PYTHON) -m pytest tests $(PYTEST_ARGS) + $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) # # Assets diff --git a/tests/conftest.py b/tests/conftest.py index 3b8cd32..2f62241 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,16 +9,41 @@ from .assets import assets -@pytest.fixture -def workspace(tmpdir, pytestconfig): +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def workspace(tmpdir, pytestconfig, request): def _make_workspace(workspace_path): initLogging() if pytestconfig.getoption('verbose') > 0: setOverrideLogLevel('DEBUG') with pushd_popd(tmpdir): - yield Resolver().workspace_from_url(workspace_path, dst_dir=tmpdir, download=True) + directory = str(tmpdir) + resolver = Resolver() + workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True) + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + workspace = Workspace(resolver, directory, mets_server_url='mets.sock') + yield {'workspace': workspace, 'mets_server_url': 'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace} + config.reset_defaults() return _make_workspace + @pytest.fixture def workspace_manifesto(workspace): yield from workspace(assets.path_to('communist_manifesto/data/mets.xml')) @@ -34,4 +59,3 @@ def workspace_aufklaerung_region(workspace): @pytest.fixture def workspace_sbb(workspace): yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) - diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 4396aec..2f8a522 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -2,41 +2,42 @@ import json -from .assets import assets - from ocrd import run_processor from ocrd_kraken.binarize import KrakenBinarize +from .assets import assets + + PARAM_JSON = assets.url_of('param-binarize.json') def test_param_json(workspace_sbb): - ws = workspace_sbb run_processor(KrakenBinarize, - workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-BIN-KRAKEN", parameter=json.load(open(PARAM_JSON)), + **workspace_sbb, ) + ws = workspace_sbb['workspace'] ws.save_mets() def test_binarize_regions(workspace_aufklaerung): - ws = workspace_aufklaerung run_processor(KrakenBinarize, - workspace=ws, input_file_grp="OCR-D-GT-PAGE", output_file_grp="OCR-D-BIN-KRAKEN", parameter={'level-of-operation': 'region'}, + **workspace_aufklaerung, ) + ws = workspace_aufklaerung['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) def test_binarize_lines(workspace_aufklaerung): - ws = workspace_aufklaerung run_processor(KrakenBinarize, - workspace=ws, input_file_grp="OCR-D-GT-PAGE", output_file_grp="OCR-D-BIN-KRAKEN", parameter={'level-of-operation': 'line'}, + **workspace_aufklaerung, ) + ws = workspace_aufklaerung['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 290d7d0..eef425a 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -3,12 +3,13 @@ from ocrd import run_processor from ocrd_kraken.recognize import KrakenRecognize + def test_recognize(workspace_manifesto): - ws = workspace_manifesto run_processor(KrakenRecognize, - workspace=ws, input_file_grp="OCR-D-SEG-KRAKEN", output_file_grp="OCR-D-OCR-KRAKEN", + **workspace_manifesto, ) + ws = workspace_manifesto['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) diff --git a/tests/test_segment.py b/tests/test_segment.py index 27dc2d7..66a8ac6 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -3,36 +3,37 @@ from ocrd import run_processor from ocrd_kraken.segment import KrakenSegment + def test_run_blla(workspace_manifesto): - ws = workspace_manifesto run_processor(KrakenSegment, - workspace=ws, input_file_grp="OCR-D-IMG-BIN", output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': False} + parameter={'maxcolseps': 0, 'use_legacy': False}, + **workspace_manifesto, ) + ws = workspace_manifesto['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) def test_run_blla_regionlevel(workspace_aufklaerung_region): - ws = workspace_aufklaerung_region run_processor(KrakenSegment, - workspace=ws, input_file_grp="OCR-D-GT-SEG-REGION", output_file_grp="OCR-D-SEG-LINE-KRAKEN", page_id="phys_0005", - parameter={'maxcolseps': 0, 'use_legacy': False} + parameter={'maxcolseps': 0, 'use_legacy': False}, + **workspace_aufklaerung_region, ) + ws = workspace_aufklaerung_region['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) def test_run_legacy(workspace_manifesto): - ws = workspace_manifesto run_processor(KrakenSegment, - workspace=ws, input_file_grp="OCR-D-IMG-BIN", output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': True} + parameter={'maxcolseps': 0, 'use_legacy': True}, + **workspace_manifesto, ) + ws = workspace_manifesto['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) From 32b2e9c0ef9c23cd59f544f11afcad96c12b3f6a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 13:55:41 +0200 Subject: [PATCH 20/25] remove v2 tool facility --- ocrd_kraken/binarize.py | 2 -- ocrd_kraken/cli.py | 10 ---------- ocrd_kraken/config.py | 5 ----- ocrd_kraken/recognize.py | 3 +-- 4 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 ocrd_kraken/cli.py delete mode 100644 ocrd_kraken/config.py diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index d8da6a4..49ceeca 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -11,8 +11,6 @@ from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd_modelfactory import page_from_file -from ocrd_kraken.config import OCRD_TOOL - class KrakenBinarize(Processor): diff --git a/ocrd_kraken/cli.py b/ocrd_kraken/cli.py deleted file mode 100644 index ead681b..0000000 --- a/ocrd_kraken/cli.py +++ /dev/null @@ -1,10 +0,0 @@ -import click - -from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_kraken.binarize import KrakenBinarize - -@click.command() -@ocrd_cli_options -def ocrd_kraken_binarize(*args, **kwargs): - return ocrd_cli_wrap_processor(KrakenBinarize, *args, **kwargs) - diff --git a/ocrd_kraken/config.py b/ocrd_kraken/config.py deleted file mode 100644 index 1816957..0000000 --- a/ocrd_kraken/config.py +++ /dev/null @@ -1,5 +0,0 @@ -import json -from ocrd_utils import resource_filename - -with open(resource_filename('ocrd_kraken', 'ocrd-tool.json'), 'r', encoding='utf-8') as f: - OCRD_TOOL = json.load(f) diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index 4e97504..6daf0a6 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -9,7 +9,6 @@ from ocrd import Processor from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, bbox_from_polygon, @@ -32,13 +31,13 @@ WordType, GlyphType, CoordsType, - to_xml ) from ocrd_models.ocrd_page_generateds import ( ReadingDirectionSimpleType, TextLineOrderSimpleType ) + class KrakenRecognize(Processor): @property From c73b3efa6dd771db06d207303370fb0c71395fa4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 13:57:04 +0200 Subject: [PATCH 21/25] =?UTF-8?q?tests:=20use=20workspace=20manifesto?= =?UTF-8?q?=E2=86=92aufklaerung=20(1=E2=86=922=20pages),=20binarize=20ad?= =?UTF-8?q?=20hoc=20where=20needed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_recognize.py | 17 +++++++++++++---- tests/test_segment.py | 27 ++++++++++++++++++--------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/tests/test_recognize.py b/tests/test_recognize.py index eef425a..3ebeb71 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -2,14 +2,23 @@ from ocrd import run_processor from ocrd_kraken.recognize import KrakenRecognize +from ocrd_kraken.binarize import KrakenBinarize -def test_recognize(workspace_manifesto): +def test_recognize(workspace_aufklaerung): + # some models (like default en) require binarized images + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-GT-PAGE-BIN", + **workspace_aufklaerung, + ) run_processor(KrakenRecognize, - input_file_grp="OCR-D-SEG-KRAKEN", + # re-use layout, overwrite text: + input_file_grp="OCR-D-GT-PAGE-BIN", output_file_grp="OCR-D-OCR-KRAKEN", - **workspace_manifesto, + parameter={'overwrite_text': True}, + **workspace_aufklaerung, ) - ws = workspace_manifesto['workspace'] + ws = workspace_aufklaerung['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) diff --git a/tests/test_segment.py b/tests/test_segment.py index 66a8ac6..ec7e1ee 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -2,16 +2,17 @@ from ocrd import run_processor from ocrd_kraken.segment import KrakenSegment +from ocrd_kraken.binarize import KrakenBinarize -def test_run_blla(workspace_manifesto): +def test_run_blla(workspace_aufklaerung): run_processor(KrakenSegment, - input_file_grp="OCR-D-IMG-BIN", + input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-LINE-KRAKEN", parameter={'maxcolseps': 0, 'use_legacy': False}, - **workspace_manifesto, + **workspace_aufklaerung, ) - ws = workspace_manifesto['workspace'] + ws = workspace_aufklaerung['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) @@ -19,6 +20,7 @@ def test_run_blla_regionlevel(workspace_aufklaerung_region): run_processor(KrakenSegment, input_file_grp="OCR-D-GT-SEG-REGION", output_file_grp="OCR-D-SEG-LINE-KRAKEN", + # only 1 page (takes 3min per page without GPU) page_id="phys_0005", parameter={'maxcolseps': 0, 'use_legacy': False}, **workspace_aufklaerung_region, @@ -27,13 +29,20 @@ def test_run_blla_regionlevel(workspace_aufklaerung_region): ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) -def test_run_legacy(workspace_manifesto): +def test_run_legacy(workspace_aufklaerung): + # legacy segmentation requires binarized images + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-GT-PAGE-BIN", + **workspace_aufklaerung, + ) run_processor(KrakenSegment, - input_file_grp="OCR-D-IMG-BIN", + # overwrite layout: + input_file_grp="OCR-D-GT-PAGE-BIN", output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': True}, - **workspace_manifesto, + parameter={'maxcolseps': 0, 'use_legacy': True, 'overwrite_segments': True}, + **workspace_aufklaerung, ) - ws = workspace_manifesto['workspace'] + ws = workspace_aufklaerung['workspace'] ws.save_mets() # FIXME: add result assertions (find_files, parsing PAGE etc) From a23d4c31e567ac1adc5eb6c2fa4465d2d6301a80 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 13:58:07 +0200 Subject: [PATCH 22/25] tests: avoid running into 'too many failures' --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index 2f62241..b4868be 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,6 +21,7 @@ def _make_workspace(workspace_path): directory = str(tmpdir) resolver = Resolver() workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True) + config.OCRD_MISSING_OUTPUT = "ABORT" if 'metscache' in request.param: config.OCRD_METS_CACHING = True print("enabled METS caching") From ae6445b7b1d72ee41a80f83cf9f75ed388307ae7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 13:58:40 +0200 Subject: [PATCH 23/25] update v3 requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5639e86..75ab38b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0a1 +ocrd >= 3.0.0b2 kraken >= 5.0 scipy shapely From fd15e2a04120e6753518d0c8670a38f883fbcb15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 14:44:06 +0200 Subject: [PATCH 24/25] tests: add actual assertions --- tests/test_binarize.py | 22 ++++++++++++++++++++-- tests/test_recognize.py | 14 +++++++++++++- tests/test_segment.py | 23 ++++++++++++++++++++--- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 2f8a522..da9adea 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -1,8 +1,13 @@ # pylint: disable=import-error import json +import os from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.binarize import KrakenBinarize from .assets import assets @@ -10,6 +15,17 @@ PARAM_JSON = assets.url_of('param-binarize.json') +def analyse_result(ws, level): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-BIN-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_images = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype="//^image/.*")) + assert len(out_images), "found no output image file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_images = out_pcgts.etree.xpath('//page:%s/page:AlternativeImage[contains(@comments,"binarized")]' % level, namespaces=NAMESPACES) + assert len(out_images) > 0, "found no binarized AlternativeImages in output PAGE file" + def test_param_json(workspace_sbb): run_processor(KrakenBinarize, input_file_grp="OCR-D-IMG", @@ -19,6 +35,7 @@ def test_param_json(workspace_sbb): ) ws = workspace_sbb['workspace'] ws.save_mets() + analyse_result(ws, 'Page') def test_binarize_regions(workspace_aufklaerung): run_processor(KrakenBinarize, @@ -29,7 +46,7 @@ def test_binarize_regions(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws, 'TextRegion') def test_binarize_lines(workspace_aufklaerung): run_processor(KrakenBinarize, @@ -40,4 +57,5 @@ def test_binarize_lines(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws, 'TextLine') + diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 3ebeb71..8354a0e 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -1,6 +1,12 @@ # pylint: disable=import-error +import os + from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.recognize import KrakenRecognize from ocrd_kraken.binarize import KrakenBinarize @@ -21,4 +27,10 @@ def test_recognize(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-KRAKEN')) + results = ws.find_files(file_grp='OCR-D-OCR-KRAKEN', mimetype=MIMETYPE_PAGE) + result0 = next(results, False) + assert result0, "found no output PAGE file" + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) + assert len(text0) > 0, "found no glyph text in output PAGE file" diff --git a/tests/test_segment.py b/tests/test_segment.py index ec7e1ee..6c00880 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -1,10 +1,27 @@ # pylint: disable=import-error +import os + from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.segment import KrakenSegment from ocrd_kraken.binarize import KrakenBinarize +def analyse_result(ws): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-SEG-LINE-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-SEG-LINE-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_regions = out_pcgts.etree.xpath('//page:TextRegion/page:Coords', namespaces=NAMESPACES) + assert len(out_regions) > 0, "found no text regions in output PAGE file" + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines), "found no text lines in output PAGE file" + def test_run_blla(workspace_aufklaerung): run_processor(KrakenSegment, input_file_grp="OCR-D-IMG", @@ -14,7 +31,7 @@ def test_run_blla(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws) def test_run_blla_regionlevel(workspace_aufklaerung_region): run_processor(KrakenSegment, @@ -27,7 +44,7 @@ def test_run_blla_regionlevel(workspace_aufklaerung_region): ) ws = workspace_aufklaerung_region['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws) def test_run_legacy(workspace_aufklaerung): # legacy segmentation requires binarized images @@ -45,4 +62,4 @@ def test_run_legacy(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws) From 43a88ea2111fc8e176600ecd09694344292032b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:26:24 +0200 Subject: [PATCH 25/25] update v3 requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 75ab38b..0907187 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0b2 +ocrd >= 3.0.0b3 kraken >= 5.0 scipy shapely