Fix cosine annealing scheduling in training parts

mittagessen · Apr 3, 2024 · f7fb622 · f7fb622
1 parent 5263797
commit f7fb622
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 15 deletions.
diff --git a/kraken/ketos/pretrain.py b/kraken/ketos/pretrain.py
@@ -123,6 +123,10 @@
               show_default=True,
               default=RECOGNITION_PRETRAIN_HYPER_PARAMS['cos_t_max'],
               help='Epoch of minimal learning rate for cosine LR scheduler.')
+@click.option('--cos-min-lr',
+              show_default=True,
+              default=RECOGNITION_PRETRAIN_HYPER_PARAMS['cos_min_lr'],
+              help='Minimal final learning rate for cosine LR scheduler.')
 @click.option('-p', '--partition', show_default=True, default=0.9,
               help='Ground truth data partition ratio between train/validation set')
 @click.option('--fixed-splits/--ignore-fixed-splits', show_default=True, default=False,
@@ -183,7 +187,7 @@
 def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
              min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
              weight_decay, warmup, schedule, gamma, step_size, sched_patience,
-             cos_max, partition, fixed_splits, training_files,
+             cos_max, cos_min_lr, partition, fixed_splits, training_files,
              evaluation_files, workers, threads, load_hyper_parameters, repolygonize,
              force_binarization, format_type, augment,
              mask_probability, mask_width, num_negatives, logit_temp,
@@ -227,6 +231,7 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
                          'step_size': step_size,
                          'rop_patience': sched_patience,
                          'cos_t_max': cos_max,
+                         'cos_min_lr': cos_min_lr,
                          'augment': augment,
                          'mask_prob': mask_probability,
                          'mask_width': mask_width,

diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py
@@ -127,6 +127,10 @@
               show_default=True,
               default=RECOGNITION_HYPER_PARAMS['cos_t_max'],
               help='Epoch of minimal learning rate for cosine LR scheduler.')
+@click.option('--cos-min-lr',
+              show_default=True,
+              default=RECOGNITION_HYPER_PARAMS['cos_min_lr'],
+              help='Minimal final learning rate for cosine LR scheduler.')
 @click.option('-p', '--partition', show_default=True, default=0.9,
               help='Ground truth data partition ratio between train/validation set')
 @click.option('--fixed-splits/--ignore-fixed-split', show_default=True, default=False,
@@ -194,13 +198,14 @@
 @click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
 @click.option('--legacy-polygons', show_default=True, default=False, is_flag=True, help='Use the legacy polygon extractor.')
 def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
-          min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
-          weight_decay, warmup, freeze_backbone, schedule, gamma, step_size,
-          sched_patience, cos_max, partition, fixed_splits, normalization,
-          normalize_whitespace, codec, resize, reorder, base_dir,
-          training_files, evaluation_files, workers, threads, load_hyper_parameters,
-          repolygonize, force_binarization, format_type, augment,
-          pl_logger, log_dir, ground_truth, legacy_polygons):
+          min_epochs, lag, min_delta, device, precision, optimizer, lrate,
+          momentum, weight_decay, warmup, freeze_backbone, schedule, gamma,
+          step_size, sched_patience, cos_max, cos_min_lr, partition,
+          fixed_splits, normalization, normalize_whitespace, codec, resize,
+          reorder, base_dir, training_files, evaluation_files, workers,
+          threads, load_hyper_parameters, repolygonize, force_binarization,
+          format_type, augment, pl_logger, log_dir, ground_truth,
+          legacy_polygons):
     """
     Trains a model from image-text pairs.
     """
@@ -253,6 +258,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
                          'step_size': step_size,
                          'rop_patience': sched_patience,
                          'cos_t_max': cos_max,
+                         'cos_min_lr': cos_min_lr,
                          'normalization': normalization,
                          'normalize_whitespace': normalize_whitespace,
                          'augment': augment,

diff --git a/kraken/ketos/ro.py b/kraken/ketos/ro.py
@@ -115,6 +115,10 @@
               show_default=True,
               default=READING_ORDER_HYPER_PARAMS['cos_t_max'],
               help='Epoch of minimal learning rate for cosine LR scheduler.')
+@click.option('--cos-min-lr',
+              show_default=True,
+              default=READING_ORDER_HYPER_PARAMS['cos_min_lr'],
+              help='Minimal final learning rate for cosine LR scheduler.')
 @click.option('-p', '--partition', show_default=True, default=0.9,
               help='Ground truth data partition ratio between train/validation set')
 @click.option('-t', '--training-files', show_default=True, default=None, multiple=True,
@@ -143,9 +147,9 @@
 def rotrain(ctx, batch_size, output, load, freq, quit, epochs, min_epochs, lag,
             min_delta, device, precision, optimizer, lrate, momentum,
             weight_decay, warmup, schedule, gamma, step_size, sched_patience,
-            cos_max, partition, training_files, evaluation_files, workers,
-            threads, load_hyper_parameters, format_type, pl_logger, log_dir,
-            level, reading_order, ground_truth):
+            cos_max, cos_min_lr, partition, training_files, evaluation_files,
+            workers, threads, load_hyper_parameters, format_type, pl_logger,
+            log_dir, level, reading_order, ground_truth):
     """
     Trains a baseline labeling model for layout analysis
     """
@@ -189,6 +193,7 @@ def rotrain(ctx, batch_size, output, load, freq, quit, epochs, min_epochs, lag,
                          'step_size': step_size,
                          'rop_patience': sched_patience,
                          'cos_t_max': cos_max,
+                         'cos_min_lr': cos_min_lr,
                          'pl_logger': pl_logger,
                          }
                         )

diff --git a/kraken/ketos/segmentation.py b/kraken/ketos/segmentation.py
@@ -151,6 +151,10 @@ def _validate_merging(ctx, param, value):
               show_default=True,
               default=SEGMENTATION_HYPER_PARAMS['cos_t_max'],
               help='Epoch of minimal learning rate for cosine LR scheduler.')
+@click.option('--cos-min-lr',
+              show_default=True,
+              default=SEGMENTATION_HYPER_PARAMS['cos_min_lr'],
+              help='Minimal final learning rate for cosine LR scheduler.')
 @click.option('-p', '--partition', show_default=True, default=0.9,
               help='Ground truth data partition ratio between train/validation set')
 @click.option('-t', '--training-files', show_default=True, default=None, multiple=True,
@@ -226,12 +230,12 @@ def _validate_merging(ctx, param, value):
 def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs,
              min_epochs, lag, min_delta, device, precision, optimizer, lrate,
              momentum, weight_decay, warmup, schedule, gamma, step_size,
-             sched_patience, cos_max, partition, training_files,
+             sched_patience, cos_max, cos_min_lr, partition, training_files,
              evaluation_files, workers, threads, load_hyper_parameters,
              force_binarization, format_type, suppress_regions,
              suppress_baselines, valid_regions, valid_baselines, merge_regions,
-             merge_baselines, bounding_regions,
-             augment, resize, topline, pl_logger, log_dir, ground_truth):
+             merge_baselines, bounding_regions, augment, resize, topline,
+             pl_logger, log_dir, ground_truth):
     """
     Trains a baseline labeling model for layout analysis
     """
@@ -285,6 +289,7 @@ def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs,
                          'step_size': step_size,
                          'rop_patience': sched_patience,
                          'cos_t_max': cos_max,
+                         'cos_min_lr': cos_min_lr,
                          })
 
     # disable automatic partition when given evaluation set explicitly

diff --git a/kraken/lib/default_specs.py b/kraken/lib/default_specs.py
@@ -40,6 +40,7 @@
                               'rop_patience': 5,
                               # cosine
                               'cos_t_max': 100,
+                              'cos_min_lr': 0.001,
                               'warmup': 0,
                               }
 
@@ -67,6 +68,7 @@
                                      'rop_patience': 5,
                                      # cosine
                                      'cos_t_max': 100,
+                                     'cos_min_lr': 1e-7,
                                      # masking parameters
                                      'mask_width': 4,
                                      'mask_prob': 0.5,
@@ -101,6 +103,7 @@
                             'rop_patience': 5,
                             # cosine
                             'cos_t_max': 50,
+                            'cos_min_lr': 1e-4,
                             'warmup': 0,
                             'freeze_backbone': 0,
                             }
@@ -129,5 +132,6 @@
                              'rop_patience': 5,
                              # cosine
                              'cos_t_max': 50,
+                             'cos_min_lr': 2e-5,
                              'warmup': 0,
                              }
diff --git a/kraken/lib/train.py b/kraken/lib/train.py
@@ -1129,6 +1129,8 @@ def _configure_optimizer_and_lr_scheduler(hparams, params, len_train_set=None, l
     weight_decay = hparams.get("weight_decay")
     schedule = hparams.get("schedule")
     gamma = hparams.get("gamma")
+    cos_t_max = hparams.get("cos_t_max")
+    cos_min_lr = hparams.get("cos_min_lr")
     step_size = hparams.get("step_size")
     rop_factor = hparams.get("rop_factor")
     rop_patience = hparams.get("rop_patience")
@@ -1149,7 +1151,10 @@ def _configure_optimizer_and_lr_scheduler(hparams, params, len_train_set=None, l
         lr_sched = {'scheduler': lr_scheduler.ExponentialLR(optim, gamma, last_epoch=completed_epochs-1),
                     'interval': 'step'}
     elif schedule == 'cosine':
-        lr_sched = {'scheduler': lr_scheduler.CosineAnnealingLR(optim, gamma, last_epoch=completed_epochs-1),
+        lr_sched = {'scheduler': lr_scheduler.CosineAnnealingLR(optim,
+                                                                cos_t_max,
+                                                                cos_min_lr,
+                                                                last_epoch=completed_epochs-1),
                     'interval': 'step'}
     elif schedule == 'step':
         lr_sched = {'scheduler': lr_scheduler.StepLR(optim, step_size, gamma, last_epoch=completed_epochs-1),