Faster polygonal line extraction
commit 432422a27615824bc5dd53086d79e0848fdbcb94
Author: Robin Champenois <[email protected]>
Date:   Fri Mar 1 14:25:19 2024 +0100

    Better legacy polygon arrow behavior

commit aeaa6890cd3952ccb7008f021bbaee6fbfd62d93
Author: Robin Champenois <[email protected]>
Date:   Tue Feb 27 19:20:41 2024 +0100

    Tests for arrow dataset and new polygons

commit be0083ca02832766b0fe25caba0d8171868e3ec6
Author: Robin Champenois <[email protected]>
Date:   Tue Feb 27 17:29:25 2024 +0100

    Handle extract_polygons toggle for Arrow Datasets

commit d830fb7a3418aa85340289fd9fac05e9d5635bdc
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 16:57:33 2024 +0100

    Improve legacy polygon tests

commit b99b7882ee3f38c16d9fb41edbf6af343d527368
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 16:01:31 2024 +0100

    Full new polygon extraction tests

commit d3398797faa5216e61682bfd62474bb720c325bf
Author: Robin Champenois <[email protected]>
Date:   Mon Feb 26 13:31:40 2024 +0100

    [WIP] test the application of new polygons

commit 16be09173b7eb2e5c2c00b03254b6e246e7e158c
Author: Benjamin Kiessling <[email protected]>
Date:   Tue Mar 26 10:28:31 2024 +0100

    [WIP] legacy polygon flag system

commit 197569e0ce28df80613b9b666231e49c902dcb96
Author: Benjamin Kiessling <[email protected]>
Date:   Tue Mar 26 10:27:57 2024 +0100

    Fix tests

commit 29c7266e802370c356f64b22aa9817e8df721937
Author: Robin Champenois <[email protected]>
Date:   Mon Dec 4 13:43:20 2023 +0100

    Faster and cleaner extract_polygons and _rotate
mittagessen committed Mar 26, 2024
1 parent 07d33d6 commit 3201fdd
Showing 17 changed files with 1,027 additions and 149 deletions.
7 changes: 4 additions & 3 deletions kraken/contrib/extract_lines.py
@@ -9,8 +9,9 @@
'link to source images.')
@click.option('-i', '--model', default=None, show_default=True, type=click.Path(exists=True),
help='Baseline detection model to use. Overrides format type and expects image files as input.')
@click.option('--legacy-polygons', is_flag=True, help='Use the legacy polygon extractor.')
@click.argument('files', nargs=-1)
def cli(format_type, model, files):
def cli(format_type, model, legacy_polygons, files):
"""
A small script extracting rectified line polygons as defined in either ALTO or
PageXML files or run a model to do the same.
@@ -37,7 +38,7 @@ def cli(format_type, model, files):
data = xml.XMLPage(doc, format_type)
if len(data.lines) > 0:
bounds = data.to_container()
for idx, (im, box) in enumerate(segmentation.extract_polygons(Image.open(bounds.imagename), bounds)):
for idx, (im, box) in enumerate(segmentation.extract_polygons(Image.open(bounds.imagename), bounds, legacy=legacy_polygons)):
click.echo('.', nl=False)
im.save('{}.{}.jpg'.format(splitext(bounds.imagename)[0], idx))
with open('{}.{}.gt.txt'.format(splitext(bounds.imagename)[0], idx), 'w') as fp:
@@ -61,7 +62,7 @@ def cli(format_type, model, files):
click.echo(f'Processing {doc} ', nl=False)
full_im = Image.open(doc)
bounds = blla.segment(full_im, model=net)
for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds)):
for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds, legacy=legacy_polygons)):
click.echo('.', nl=False)
im.save('{}.{}.jpg'.format(splitext(doc)[0], idx))

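For reference, the legacy keyword added to extract_polygons above can be driven directly from a script. A minimal sketch of the same flow as extract_lines.py (not part of the diff; 'page_0001.xml' is a placeholder file name):

# Sketch only: mirrors the extract_lines.py flow in the hunks above.
from PIL import Image
from kraken.lib import segmentation, xml

data = xml.XMLPage('page_0001.xml', filetype='xml')
bounds = data.to_container()
im = Image.open(bounds.imagename)

# legacy=False selects the new, faster extractor; legacy=True keeps the old
# behaviour, matching the --legacy-polygons flag added above.
for idx, (line_im, box) in enumerate(segmentation.extract_polygons(im, bounds, legacy=False)):
    line_im.save(f'line_{idx}.png')
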
7 changes: 5 additions & 2 deletions kraken/ketos/dataset.py
@@ -55,9 +55,11 @@
help='Minimum number of records per RecordBatch written to the '
'output file. Larger batches require more transient memory '
'but slightly improve reading performance.')
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True,
help='Use the old polygon extractor.')
@click.argument('ground_truth', nargs=-1, type=click.Path(exists=True, dir_okay=False))
def compile(ctx, output, workers, format_type, files, random_split, force_type,
save_splits, skip_empty_lines, recordbatch_size, ground_truth):
save_splits, skip_empty_lines, recordbatch_size, ground_truth, legacy_polygons):
"""
Precompiles a binary dataset from a collection of XML files.
"""
@@ -91,6 +93,7 @@ def compile(ctx, output, workers, format_type, files, random_split, force_type,
force_type,
recordbatch_size,
skip_empty_lines,
lambda advance, total: progress.update(extract_task, total=total, advance=advance))
lambda advance, total: progress.update(extract_task, total=total, advance=advance),
legacy_polygons=legacy_polygons)

message(f'Output file written to {output}')
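The compile command forwards the flag to the binary dataset compiler whose argument list is partially visible in the hunk above. A hedged sketch of the equivalent library call, assuming kraken.lib.arrow_dataset.build_binary_dataset accepts these keyword arguments (file and output names are placeholders):

# Assumption: build_binary_dataset exposes the keywords visible in the hunk
# above plus the usual files/output_file/format_type arguments.
from kraken.lib import arrow_dataset

arrow_dataset.build_binary_dataset(files=['page_0001.xml', 'page_0002.xml'],
                                   output_file='dataset.arrow',
                                   format_type='xml',
                                   skip_empty_lines=True,
                                   # keep False unless the recognizer trained on
                                   # this dataset also uses the legacy extractor
                                   legacy_polygons=False)
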
11 changes: 7 additions & 4 deletions kraken/ketos/pretrain.py
@@ -133,7 +133,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker processes.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker processes.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyperparameters from the model')
@@ -179,14 +179,15 @@
default=RECOGNITION_PRETRAIN_HYPER_PARAMS['logit_temp'],
help='Multiplicative factor for the logits used in contrastive loss.')
@click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True, help='Use the legacy polygon extractor.')
def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
weight_decay, warmup, schedule, gamma, step_size, sched_patience,
cos_max, partition, fixed_splits, training_files,
evaluation_files, workers, threads, load_hyper_parameters, repolygonize,
force_binarization, format_type, augment,
mask_probability, mask_width, num_negatives, logit_temp,
ground_truth):
ground_truth, legacy_polygons):
"""
Trains a model from image-text pairs.
"""
@@ -258,7 +259,8 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
output=output,
spec=spec,
model=load,
load_hyper_parameters=load_hyper_parameters)
load_hyper_parameters=load_hyper_parameters,
legacy_polygons=legacy_polygons)

data_module = PretrainDataModule(batch_size=hyper_params.pop('batch_size'),
pad=hyper_params.pop('pad'),
@@ -273,7 +275,8 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
channels=model.channels,
repolygonize=repolygonize,
force_binarization=force_binarization,
format_type=format_type)
format_type=format_type,
legacy_polygons=legacy_polygons,)

model.len_train_set = len(data_module.train_dataloader())

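Both the pretraining model wrapper and PretrainDataModule now receive the same legacy_polygons value, so data extraction and the flag recorded in the model stay in sync. A quick way to confirm the new option is wired into the CLI without starting a training run, using click's test runner (the import path follows the kraken/ketos/pretrain.py file header above):

from click.testing import CliRunner
from kraken.ketos.pretrain import pretrain

# --help exits before the command body runs, so no data or model is needed.
result = CliRunner().invoke(pretrain, ['--help'])
assert '--legacy-polygons' in result.output
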
52 changes: 46 additions & 6 deletions kraken/ketos/recognition.py
@@ -21,6 +21,8 @@
import logging
import pathlib
from typing import List
from functools import partial
import warnings

import click
from threadpoolctl import threadpool_limits
@@ -157,7 +159,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker processes.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker processes.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyperparameters from the model')
@@ -190,14 +192,15 @@
@click.option('--log-dir', show_default=True, type=click.Path(exists=True, dir_okay=True, writable=True),
help='Path to directory where the logger will store the logs. If not set, a directory will be created in the current working directory.')
@click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--legacy-polygons', show_default=True, default=False, is_flag=True, help='Use the legacy polygon extractor.')
def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
weight_decay, warmup, freeze_backbone, schedule, gamma, step_size,
sched_patience, cos_max, partition, fixed_splits, normalization,
normalize_whitespace, codec, resize, reorder, base_dir,
training_files, evaluation_files, workers, threads, load_hyper_parameters,
repolygonize, force_binarization, format_type, augment,
pl_logger, log_dir, ground_truth):
pl_logger, log_dir, ground_truth, legacy_polygons):
"""
Trains a model from image-text pairs.
"""
@@ -300,7 +303,19 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
force_binarization=force_binarization,
format_type=format_type,
codec=codec,
resize=resize)
resize=resize,
legacy_polygons=legacy_polygons)

# Force upgrade to new polygon extractor if model was not trained with it
if model.nn and model.nn.use_legacy_polygons:
if not legacy_polygons and not model.legacy_polygons:
# upgrade to new polygon extractor
logger.warning('The model will be flagged to use new polygon extractor.')
model.nn.use_legacy_polygons = False
if not model.nn and legacy_polygons != model.legacy_polygons:
logger.warning(f'Dataset was compiled with legacy polygon extractor: {model.legacy_polygons}, '
f'the new model will be flagged to use {"legacy" if model.legacy_polygons else "new"} method.')
legacy_polygons = model.legacy_polygons

trainer = KrakenTrainer(accelerator=accelerator,
devices=device,
@@ -349,7 +364,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
@click.option('--pad', show_default=True, type=click.INT, default=16, help='Left and right '
'padding around lines')
@click.option('--workers', show_default=True, default=1,
type=click.IntRange(1),
type=click.IntRange(0),
help='Number of worker processes when running on CPU.')
@click.option('--threads', show_default=True, default=1,
type=click.IntRange(1),
@@ -387,9 +402,10 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
@click.option('--fixed-splits/--ignore-fixed-split', show_default=True, default=False,
help='Whether to honor fixed splits in binary datasets.')
@click.argument('test_set', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
@click.option('--no-legacy-polygons', show_default=True, default=False, is_flag=True, help='Force disable the legacy polygon extractor.')
def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
threads, reorder, base_dir, normalization, normalize_whitespace,
repolygonize, force_binarization, format_type, fixed_splits, test_set):
repolygonize, force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons):
"""
Evaluate on a test set.
"""
@@ -410,11 +426,28 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,

logger.info('Building test set from {} line images'.format(len(test_set) + len(evaluation_files)))

legacy_polygons = None
incoherent_legacy_polygons = False

nn = {}
for p in model:
message('Loading model {}\t'.format(p), nl=False)
nn[p] = models.load_any(p, device)
message('\u2713', fg='green')
model_legacy_polygons = nn[p].nn.use_legacy_polygons
if legacy_polygons is None:
legacy_polygons = model_legacy_polygons
elif legacy_polygons != model_legacy_polygons:
incoherent_legacy_polygons = True

if incoherent_legacy_polygons and not no_legacy_polygons:
logger.warning('Models use different polygon extractors. Legacy polygon extractor will be used ; use --no-legacy-polygons to force disable it.')
legacy_polygons = True
elif no_legacy_polygons:
legacy_polygons = False

if legacy_polygons:
warnings.warn('Using legacy polygon extractor, as the model was not trained with the new method. Please retrain your model to get performance improvements.')

pin_ds_mem = False
if device != 'cpu':
@@ -440,7 +473,7 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
message('Repolygonizing data')
test_set = [{'page': XMLPage(file, filetype=format_type).to_container()} for file in test_set]
valid_norm = False
DatasetClass = PolygonGTDataset
DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons)
elif format_type == 'binary':
DatasetClass = ArrowIPCRecognitionDataset
if repolygonize:
@@ -485,6 +518,13 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
ds.add(**line)
except ValueError as e:
logger.info(e)

if hasattr(ds, 'legacy_polygon_status'):
if ds.legacy_polygons_status != legacy_polygons:
warnings.warn(
f'Binary dataset was compiled with legacy polygon extractor: {ds.legacy_polygon_status}, '
f'while expecting data extracted with {"legacy" if legacy_polygons else "new"} method. Results may be inaccurate.')

# don't encode validation set as the alphabets may not match causing encoding failures
ds.no_encode()
ds_loader = DataLoader(ds,
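The test command aggregates the extractor setting across all loaded models and then binds it into the dataset class once. The functools.partial idiom from the hunk above, shown in isolation (the flag value is a stand-in for whatever the loaded models report; the remaining constructor arguments are passed at the existing call sites exactly as before):

from functools import partial
from kraken.lib.dataset import PolygonGTDataset

# Derived from the models' use_legacy_polygons flags, as in test() above.
legacy_polygons = False

# Bind the extractor choice once; every later DatasetClass(...) call in the
# command picks it up without threading the flag through each call site.
DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons)
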
2 changes: 1 addition & 1 deletion kraken/ketos/ro.py
@@ -123,7 +123,7 @@
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker proesses.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker proesses.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyper-parameters from the model')
4 changes: 2 additions & 2 deletions kraken/ketos/segmentation.py
@@ -159,7 +159,7 @@ def _validate_merging(ctx, param, value):
@click.option('-e', '--evaluation-files', show_default=True, default=None, multiple=True,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data. Overrides the `-p` parameter')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(1), help='Number of worker proesses.')
@click.option('--workers', show_default=True, default=1, type=click.IntRange(0), help='Number of worker proesses.')
@click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
@click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
help='When loading an existing model, retrieve hyper-parameters from the model')
@@ -382,7 +382,7 @@ def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs,
callback=_validate_manifests, type=click.File(mode='r', lazy=True),
help='File(s) with paths to evaluation data.')
@click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)')
@click.option('--workers', default=1, show_default=True, type=click.IntRange(1),
@click.option('--workers', default=1, show_default=True, type=click.IntRange(0),
help='Number of worker processes for data loading.')
@click.option('--threads', default=1, show_default=True, type=click.IntRange(1),
help='Size of thread pools for intra-op parallelization')
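The IntRange(1) to IntRange(0) changes across the ketos commands simply legalise --workers 0, which presumably reaches the DataLoader as num_workers, where 0 means batches are loaded in the main process rather than in worker subprocesses. A minimal PyTorch illustration (the dataset is a stand-in, not kraken code):

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.zeros(8, 3))

# num_workers=0: no worker processes are spawned, loading happens inline,
# which is what the relaxed lower bound on --workers now permits.
loader = DataLoader(ds, batch_size=2, num_workers=0)
for batch in loader:
    pass
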
12 changes: 9 additions & 3 deletions kraken/kraken.py
@@ -227,10 +227,12 @@ def recognizer(model, pad, no_segmentation, bidi_reordering, tags_ignore, input,
if bounds.script_detection:
it = rpred.mm_rpred(model, im, bounds, pad,
bidi_reordering=bidi_reordering,
tags_ignore=tags_ignore)
tags_ignore=tags_ignore,
no_legacy_polygons=ctx.meta['no_legacy_polygons'])
else:
it = rpred.rpred(model['default'], im, bounds, pad,
bidi_reordering=bidi_reordering)
bidi_reordering=bidi_reordering,
no_legacy_polygons=ctx.meta['no_legacy_polygons'])

preds = []

@@ -302,8 +304,10 @@ def recognizer(model, pad, no_segmentation, bidi_reordering, tags_ignore, input,
help='On compatible devices, uses autocast for `segment` which lower the memory usage.')
@click.option('--threads', default=1, show_default=True, type=click.IntRange(1),
help='Size of thread pools for intra-op parallelization')
@click.option('--no-legacy-polygons', 'no_legacy_polygons', is_flag=True, default=False,
help="Force disable legacy polygon extraction")
def cli(input, batch_input, suffix, verbose, format_type, pdf_format,
serializer, template, device, raise_on_error, autocast, threads):
serializer, template, device, raise_on_error, autocast, threads, no_legacy_polygons):
"""
Base command for recognition functionality.
@@ -334,6 +338,8 @@ def cli(input, batch_input, suffix, verbose, format_type, pdf_format,
ctx.meta['steps'] = []
ctx.meta["autocast"] = autocast
ctx.meta['threads'] = threads
ctx.meta['no_legacy_polygons'] = no_legacy_polygons

log.set_logger(logger, level=30 - min(10 * verbose, 20))


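At the API level the new --no-legacy-polygons switch is just the no_legacy_polygons keyword of rpred.rpred / rpred.mm_rpred shown above. A hedged end-to-end sketch ('ocr_model.mlmodel' and 'page.png' are placeholders, and blla.segment is assumed to fall back to its bundled default segmentation model when none is passed):

from PIL import Image
from kraken import blla, rpred
from kraken.lib import models

im = Image.open('page.png')
net = models.load_any('ocr_model.mlmodel')
bounds = blla.segment(im)

# no_legacy_polygons=True mirrors the new CLI switch: force the new
# extractor even for models flagged to use the legacy one.
for record in rpred.rpred(net, im, bounds, pad=16,
                          bidi_reordering=True,
                          no_legacy_polygons=True):
    print(record.prediction)
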