Faster polygonal line extraction
commit 432422a27615824bc5dd53086d79e0848fdbcb94
Author: Robin Champenois <[email protected]>
Date:   Fri Mar 1 14:25:19 2024 +0100

    Better legacy polygon arrow behavior

commit aeaa6890cd3952ccb7008f021bbaee6fbfd62d93
Author: Robin Champenois <[email protected]>
Date:   Tue Feb 27 19:20:41 2024 +0100

    Tests for arrow dataset and new polygons

commit be0083ca02832766b0fe25caba0d8171868e3ec6
Author: Robin Champenois <[email protected]>
Date:   Tue Feb 27 17:29:25 2024 +0100

    Handle extract_polygons toggle for Arrow Datasets

commit d830fb7a3418aa85340289fd9fac05e9d5635bdc
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 16:57:33 2024 +0100

    Improve legacy polygon tests

commit b99b7882ee3f38c16d9fb41edbf6af343d527368
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 16:01:31 2024 +0100

    Full new polygon extraction tests

commit d3398797faa5216e61682bfd62474bb720c325bf
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 13:31:40 2024 +0100

    [WIP] test the application of new polygons

commit 16be09173b7eb2e5c2c00b03254b6e246e7e158c
Author: Benjamin Kiessling <[email protected]>
Date:   Tue Mar 26 10:28:31 2024 +0100

    [WIP] legacy polygon flag system

commit 197569e0ce28df80613b9b666231e49c902dcb96
Author: Benjamin Kiessling <[email protected]>
Date:   Tue Mar 26 10:27:57 2024 +0100

    Fix tests

commit 29c7266e802370c356f64b22aa9817e8df721937
Author: Robin Champenois <[email protected]>
Date:   Mon Dec 4 13:43:20 2023 +0100

    Faster and cleaner extract_polygons and _rotate
mittagessen committed Mar 26, 2024
1 parent 07d33d6 commit 3201fdd
Showing 17 changed files with 1,027 additions and 149 deletions.
7 changes: 4 additions & 3 deletions kraken/contrib/extract_lines.py
@@ -9,8 +9,9 @@
'link to source images.')
@click.option('-i', '--model', default=None, show_default=True, type=click.Path(exists=True),
help='Baseline detection model to use. Overrides format type and expects image files as input.')
@click.option('--legacy-polygons', is_flag=True, help='Use the legacy polygon extractor.')
@click.argument('files', nargs=-1)
def cli(format_type, model, files):
def cli(format_type, model, legacy_polygons, files):
"""
A small script extracting rectified line polygons as defined in either ALTO or
PageXML files or run a model to do the same.
@@ -37,7 +38,7 @@ def cli(format_type, model, files):
data = xml.XMLPage(doc, format_type)
if len(data.lines) > 0:
bounds = data.to_container()
for idx, (im, box) in enumerate(segmentation.extract_polygons(Image.open(bounds.imagename), bounds)):
for idx, (im, box) in enumerate(segmentation.extract_polygons(Image.open(bounds.imagename), bounds, legacy=legacy_polygons)):
click.echo('.', nl=False)
im.save('{}.{}.jpg'.format(splitext(bounds.imagename)[0], idx))
with open('{}.{}.gt.txt'.format(splitext(bounds.imagename)[0], idx), 'w') as fp:
@@ -61,7 +62,7 @@ def cli(format_type, model, files):
click.echo(f'Processing {doc} ', nl=False)
full_im = Image.open(doc)
bounds = blla.segment(full_im, model=net)
for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds)):
for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds, legacy=legacy_polygons)):
click.echo('.', nl=False)
im.save('{}.{}.jpg'.format(splitext(doc)[0], idx))

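For reference, the legacy keyword added to extract_polygons above can be driven directly from a script. A minimal sketch of the same flow as extract_lines.py (not part of the diff; 'page_0001.xml' is a placeholder file name):

# Sketch only: mirrors the extract_lines.py flow in the hunks above.
from PIL import Image
from kraken.lib import segmentation, xml

data = xml.XMLPage('page_0001.xml', filetype='xml')
bounds = data.to_container()
im = Image.open(bounds.imagename)

# legacy=False selects the new, faster extractor; legacy=True keeps the old
# behaviour, matching the --legacy-polygons flag added above.
for idx, (line_im, box) in enumerate(segmentation.extract_polygons(im, bounds, legacy=False)):
    line_im.save(f'line_{idx}.png')
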
7 changes: 5 additions & 2 deletions kraken/ketos/dataset.py
@@ -55,9 +55,11 @@
help='Minimum number of records per RecordBatch written to the '
'output file. Larger batches require more transient memory '
'but slightly improve reading performance.')
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True,
help='Use the old polygon extractor.')
@click.argument('ground_truth', nargs=-1, type=click.Path(exists=True, dir_okay=False))
def compile(ctx, output, workers, format_type, files, random_split, force_type,
save_splits, skip_empty_lines, recordbatch_size, ground_truth):
save_splits, skip_empty_lines, recordbatch_size, ground_truth, legacy_polygons):
"""
Precompiles a binary dataset from a collection of XML files.
"""
@@ -91,6 +93,7 @@ def compile(ctx, output, workers, format_type, files, random_split, force_type,
force_type,
recordbatch_size,
skip_empty_lines,
lambda advance, total: progress.update(extract_task, total=total, advance=advance))
lambda advance, total: progress.update(extract_task, total=total, advance=advance),
legacy_polygons=legacy_polygons)

message(f'Output file written to {output}')
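The compile command forwards the flag to the binary dataset compiler whose argument list is partially visible in the hunk above. A hedged sketch of the equivalent library call, assuming kraken.lib.arrow_dataset.build_binary_dataset accepts these keyword arguments (file and output names are placeholders):

# Assumption: build_binary_dataset exposes the keywords visible in the hunk
# above plus the usual files/output_file/format_type arguments.
from kraken.lib import arrow_dataset

arrow_dataset.build_binary_dataset(files=['page_0001.xml', 'page_0002.xml'],
                                   output_file='dataset.arrow',
                                   format_type='xml',
                                   skip_empty_lines=True,
                                   # keep False unless the recognizer trained on
                                   # this dataset also uses the legacy extractor
                                   legacy_polygons=False)
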
11 changes: 7 additions & 4 deletions kraken/ketos/pretrain.py
@@ -133,7 +133,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker processes.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker processes.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyperparameters from the model')
@@ -179,14 +179,15 @@
default=RECOGNITION_PRETRAIN_HYPER_PARAMS['logit_temp'],
help='Multiplicative factor for the logits used in contrastive loss.')
@click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True, help='Use the legacy polygon extractor.')
def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
weight_decay, warmup, schedule, gamma, step_size, sched_patience,
cos_max, partition, fixed_splits, training_files,
evaluation_files, workers, threads, load_hyper_parameters, repolygonize,
force_binarization, format_type, augment,
mask_probability, mask_width, num_negatives, logit_temp,
ground_truth):
ground_truth, legacy_polygons):
"""
Trains a model from image-text pairs.
"""
@@ -258,7 +259,8 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
output=output,
spec=spec,
model=load,
load_hyper_parameters=load_hyper_parameters)
load_hyper_parameters=load_hyper_parameters,
legacy_polygons=legacy_polygons)

data_module = PretrainDataModule(batch_size=hyper_params.pop('batch_size'),
pad=hyper_params.pop('pad'),
@@ -273,7 +275,8 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
channels=model.channels,
repolygonize=repolygonize,
force_binarization=force_binarization,
format_type=format_type)
format_type=format_type,
legacy_polygons=legacy_polygons,)

model.len_train_set = len(data_module.train_dataloader())

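Both the pretraining model wrapper and PretrainDataModule now receive the same legacy_polygons value, so data extraction and the flag recorded in the model stay in sync. A quick way to confirm the new option is wired into the CLI without starting a training run, using click's test runner (the import path follows the kraken/ketos/pretrain.py file header above):

from click.testing import CliRunner
from kraken.ketos.pretrain import pretrain

# --help exits before the command body runs, so no data or model is needed.
result = CliRunner().invoke(pretrain, ['--help'])
assert '--legacy-polygons' in result.output
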
52 changes: 46 additions & 6 deletions kraken/ketos/recognition.py
@@ -21,6 +21,8 @@
import logging
import pathlib
from typing import List
from functools import partial
import warnings

import click
from threadpoolctl import threadpool_limits
@@ -157,7 +159,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker processes.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker processes.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyperparameters from the model')
@@ -190,14 +192,15 @@
@click.option('--log-dir', show_default=True, type=click.Path(exists=True, dir_okay=True, writable=True),
help='Path to directory where the logger will store the logs. If not set, a directory will be created in the current working directory.')
@click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True, help='Use the legacy polygon extractor.')
def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
weight_decay, warmup, freeze_backbone, schedule, gamma, step_size,
sched_patience, cos_max, partition, fixed_splits, normalization,
normalize_whitespace, codec, resize, reorder, base_dir,
training_files, evaluation_files, workers, threads, load_hyper_parameters,
repolygonize, force_binarization, format_type, augment,
pl_logger, log_dir, ground_truth):
pl_logger, log_dir, ground_truth, legacy_polygons):
"""
Trains a model from image-text pairs.
"""
@@ -300,7 +303,19 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
force_binarization=force_binarization,
format_type=format_type,
codec=codec,
resize=resize)
resize=resize,
legacy_polygons=legacy_polygons)

# Force upgrade to new polygon extractor if model was not trained with it
if model.nn and model.nn.use_legacy_polygons:
if not legacy_polygons and not model.legacy_polygons:
# upgrade to new polygon extractor
logger.warning('The model will be flagged to use new polygon extractor.')
model.nn.use_legacy_polygons = False
if not model.nn and legacy_polygons != model.legacy_polygons:
logger.warning(f'Dataset was compiled with legacy polygon extractor: {model.legacy_polygons}, '
f'the new model will be flagged to use {"legacy" if model.legacy_polygons else "new"} method.')
legacy_polygons = model.legacy_polygons

trainer = KrakenTrainer(accelerator=accelerator,
devices=device,
@@ -349,7 +364,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
@click.option('--pad', show_default=True, type=click.INT, default=16, help='Left and right '
'padding around lines')
@click.option('--workers', show_default=True, default=1,
type=click.IntRange(1),
type=click.IntRange(0),
help='Number of worker processes when running on CPU.')
@click.option('--threads', show_default=True, default=1,
type=click.IntRange(1),
@@ -387,9 +402,10 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
@click.option('--fixed-splits/--ignore-fixed-split', show_default=True, default=False,
help='Whether to honor fixed splits in binary datasets.')
@click.argument('test_set', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--no-legacy-polygons', show_default=True, default=False, is_flag=True, help='Force disable the legacy polygon extractor.')
def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
threads, reorder, base_dir, normalization, normalize_whitespace,
repolygonize, force_binarization, format_type, fixed_splits, test_set):
repolygonize, force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons):
"""
Evaluate on a test set.
"""
@@ -410,11 +426,28 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,

logger.info('Building test set from {} line images'.format(len(test_set) + len(evaluation_files)))

legacy_polygons = None
incoherent_legacy_polygons = False

nn = {}
for p in model:
message('Loading model {}\t'.format(p), nl=False)
nn[p] = models.load_any(p, device)
message('\u2713', fg='green')
model_legacy_polygons = nn[p].nn.use_legacy_polygons
if legacy_polygons is None:
legacy_polygons = model_legacy_polygons
elif legacy_polygons != model_legacy_polygons:
incoherent_legacy_polygons = True

if incoherent_legacy_polygons and not no_legacy_polygons:
logger.warning('Models use different polygon extractors. Legacy polygon extractor will be used ; use --no-legacy-polygons to force disable it.')
legacy_polygons = True
elif no_legacy_polygons:
legacy_polygons = False

if legacy_polygons:
warnings.warn('Using legacy polygon extractor, as the model was not trained with the new method. Please retrain your model to get performance improvements.')

pin_ds_mem = False
if device != 'cpu':
@@ -440,7 +473,7 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
message('Repolygonizing data')
test_set = [{'page': XMLPage(file, filetype=format_type).to_container()} for file in test_set]
valid_norm = False
DatasetClass = PolygonGTDataset
DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons)
elif format_type == 'binary':
DatasetClass = ArrowIPCRecognitionDataset
if repolygonize:
@@ -485,6 +518,13 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
ds.add(**line)
except ValueError as e:
logger.info(e)

if hasattr(ds, 'legacy_polygon_status'):
if ds.legacy_polygons_status != legacy_polygons:
warnings.warn(
f'Binary dataset was compiled with legacy polygon extractor: {ds.legacy_polygon_status}, '
f'while expecting data extracted with {"legacy" if legacy_polygons else "new"} method. Results may be inaccurate.')

# don't encode validation set as the alphabets may not match causing encoding failures
ds.no_encode()
ds_loader = DataLoader(ds,
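The test command aggregates the extractor setting across all loaded models and then binds it into the dataset class once. The functools.partial idiom from the hunk above, shown in isolation (the flag value is a stand-in for whatever the loaded models report; the remaining constructor arguments are passed at the existing call sites exactly as before):

from functools import partial
from kraken.lib.dataset import PolygonGTDataset

# Derived from the models' use_legacy_polygons flags, as in test() above.
legacy_polygons = False

# Bind the extractor choice once; every later DatasetClass(...) call in the
# command picks it up without threading the flag through each call site.
DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons)
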
2 changes: 1 addition & 1 deletion kraken/ketos/ro.py
@@ -123,7 +123,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker proesses.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker proesses.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyper-parameters from the model')
4 changes: 2 additions & 2 deletions kraken/ketos/segmentation.py
@@ -159,7 +159,7 @@ def _validate_merging(ctx, param, value):
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker proesses.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker proesses.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyper-parameters from the model')
@@ -382,7 +382,7 @@ def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data.')
@click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)')
@click.option('--workers', default=1, show_default=True, type=click.IntRange(1),
@click.option('--workers', default=1, show_default=True, type=click.IntRange(0),
help='Number of worker processes for data loading.')
@click.option('--threads', default=1, show_default=True, type=click.IntRange(1),
help='Size of thread pools for intra-op parallelization')
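The IntRange(1) to IntRange(0) changes across the ketos commands simply legalise --workers 0, which presumably reaches the DataLoader as num_workers, where 0 means batches are loaded in the main process rather than in worker subprocesses. A minimal PyTorch illustration (the dataset is a stand-in, not kraken code):

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.zeros(8, 3))

# num_workers=0: no worker processes are spawned, loading happens inline,
# which is what the relaxed lower bound on --workers now permits.
loader = DataLoader(ds, batch_size=2, num_workers=0)
for batch in loader:
    pass
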
12 changes: 9 additions & 3 deletions kraken/kraken.py
@@ -227,10 +227,12 @@ def recognizer(model, pad, no_segmentation, bidi_reordering, tags_ignore, input,
if bounds.script_detection:
it = rpred.mm_rpred(model, im, bounds, pad,
bidi_reordering=bidi_reordering,
tags_ignore=tags_ignore)
tags_ignore=tags_ignore,
no_legacy_polygons=ctx.meta['no_legacy_polygons'])
else:
it = rpred.rpred(model['default'], im, bounds, pad,
bidi_reordering=bidi_reordering)
bidi_reordering=bidi_reordering,
no_legacy_polygons=ctx.meta['no_legacy_polygons'])

preds = []

@@ -302,8 +304,10 @@ def recognizer(model, pad, no_segmentation, bidi_reordering, tags_ignore, input,
help='On compatible devices, uses autocast for `segment` which lower the memory usage.')
@click.option('--threads', default=1, show_default=True, type=click.IntRange(1),
help='Size of thread pools for intra-op parallelization')
@click.option('--no-legacy-polygons', 'no_legacy_polygons', is_flag=True, default=False,
help="Force disable legacy polygon extraction")
def cli(input, batch_input, suffix, verbose, format_type, pdf_format,
serializer, template, device, raise_on_error, autocast, threads):
serializer, template, device, raise_on_error, autocast, threads, no_legacy_polygons):
"""
Base command for recognition functionality.
@@ -334,6 +338,8 @@ def cli(input, batch_input, suffix, verbose, format_type, pdf_format,
ctx.meta['steps'] = []
ctx.meta["autocast"] = autocast
ctx.meta['threads'] = threads
ctx.meta['no_legacy_polygons'] = no_legacy_polygons

log.set_logger(logger, level=30 - min(10 * verbose, 20))


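At the API level the new --no-legacy-polygons switch is just the no_legacy_polygons keyword of rpred.rpred / rpred.mm_rpred shown above. A hedged end-to-end sketch ('ocr_model.mlmodel' and 'page.png' are placeholders, and blla.segment is assumed to fall back to its bundled default segmentation model when none is passed):

from PIL import Image
from kraken import blla, rpred
from kraken.lib import models

im = Image.open('page.png')
net = models.load_any('ocr_model.mlmodel')
bounds = blla.segment(im)

# no_legacy_polygons=True mirrors the new CLI switch: force the new
# extractor even for models flagged to use the legacy one.
for record in rpred.rpred(net, im, bounds, pad=16,
                          bidi_reordering=True,
                          no_legacy_polygons=True):
    print(record.prediction)
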