From 95981e0bcd354f37e2df7d3d07d40ebefc426400 Mon Sep 17 00:00:00 2001
From: Benjamin Kiessling <mittagessen@l.unchti.me>
Date: Fri, 19 Jan 2024 01:14:17 +0100
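Subject: [PATCH] Move variable types from comments to annotations

Also included in this change:

- delete the stub module kraken/lib/lstm.py and its import in
  tests/test_models.py
- use the .mlmodel extension instead of .clstm in the multi-model
  recognition examples in docs/advanced.rst
- replace a `type(inputs) == tuple` check with `isinstance` in
  kraken/lib/layers.py
- adjust typing imports and reorder imports in kraken/transcribe.py
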
---
 docs/advanced.rst                  |   8 +-
 kraken/ketos/segmentation.py       |   2 +-
 kraken/kraken.py                   |   8 +-
 kraken/lib/codec.py                |   4 +-
 kraken/lib/dataset/recognition.py  |  20 ++---
 kraken/lib/dataset/segmentation.py |   3 +-
 kraken/lib/dataset/utils.py        |   2 +-
 kraken/lib/layers.py               |   4 +-
 kraken/lib/lstm.py                 | 126 -----------------------------
 kraken/lib/vgsl.py                 |  30 +++----
 kraken/pageseg.py                  |   6 +-
 kraken/serialization.py            |  18 ++---
 kraken/transcribe.py               |  11 +--
 tests/test_models.py               |   2 -
 14 files changed, 58 insertions(+), 186 deletions(-)
 delete mode 100644 kraken/lib/lstm.py

diff --git a/docs/advanced.rst b/docs/advanced.rst
index b0e232b6c..533e1280f 100644
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@@ -451,15 +451,15 @@ segmentation and a mapping between scripts and models:
 
 .. code-block:: console
 
-        $ kraken -i ... ... ocr -m Grek:porson.clstm -m Latn:antiqua.clstm
+        $ kraken -i ... ... ocr -m Grek:porson.mlmodel -m Latn:antiqua.mlmodel
 
-All polytonic Greek text portions will be recognized using the `porson.clstm`
-model while Latin text will be fed into the `antiqua.clstm` model. It is
+All polytonic Greek text portions will be recognized using the `porson.mlmodel`
+model while Latin text will be fed into the `antiqua.mlmodel` model. It is
 possible to define a fallback model that other text will be fed to:
 
 .. code-block:: console
 
-        $ kraken -i ... ... ocr -m ... -m ... -m default:porson.clstm
+        $ kraken -i ... ... ocr -m ... -m ... -m default:porson.mlmodel
 
 It is also possible to disable recognition on a particular script by mapping to
 the special model keyword `ignore`. Ignored lines will still be serialized but
diff --git a/kraken/ketos/segmentation.py b/kraken/ketos/segmentation.py
index c144e9fbd..addd5ebbf 100644
--- a/kraken/ketos/segmentation.py
+++ b/kraken/ketos/segmentation.py
@@ -45,7 +45,7 @@ def _validate_merging(ctx, param, value):
     """
     if not value:
         return None
-    merge_dict = {}  # type: Dict[str, str]
+    merge_dict: Dict[str, str] = {}
     try:
         for m in value:
             lexer = shlex.shlex(m, posix=True)
diff --git a/kraken/kraken.py b/kraken/kraken.py
index 633d7bdfe..ccb98e8a2 100644
--- a/kraken/kraken.py
+++ b/kraken/kraken.py
@@ -30,7 +30,7 @@
 
 from functools import partial
 from rich.traceback import install
-from typing import Dict, cast, Any, IO, Callable
+from typing import Dict, cast, Any, IO, Callable, Union, List
 
 import click
 
@@ -541,7 +541,7 @@ def _validate_mm(ctx, param, value):
     """
     Maps model mappings to a dictionary.
     """
-    model_dict = {'ignore': []}  # type: Dict[str, Union[str, List[str]]]
+    model_dict: Dict[str, Union[str, List[str]]] = {'ignore': []}
     if len(value) == 1:
         lexer = shlex.shlex(value[0], posix=True)
         lexer.wordchars += r'\/.+-()=^&;,.'
@@ -603,7 +603,7 @@ def ocr(ctx, model, pad, reorder, base_dir, no_segmentation, text_direction):
         reorder = base_dir
 
     # first we try to find the model in the absolute path, then ~/.kraken
-    nm = {}  # type: Dict[str, models.TorchSeqRecognizer]
+    nm: Dict[str, models.TorchSeqRecognizer] = {}
     ign_tags = model.pop('ignore')
     for k, v in model.items():
         search = [v,
@@ -629,7 +629,7 @@ def ocr(ctx, model, pad, reorder, base_dir, no_segmentation, text_direction):
     if 'default' in nm:
         from collections import defaultdict
 
-        nn = defaultdict(lambda: nm['default'])  # type: Dict[str, models.TorchSeqRecognizer]
+        nn: Dict[str, models.TorchSeqRecognizer] = defaultdict(lambda: nm['default'])
         nn.update(nm)
         nm = nn
 
diff --git a/kraken/lib/codec.py b/kraken/lib/codec.py
index a72e90da6..8eb911441 100644
--- a/kraken/lib/codec.py
+++ b/kraken/lib/codec.py
@@ -64,7 +64,7 @@ def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str],
                 raise KrakenCodecException(f'Duplicate entry in codec definition string: {cc}')
             self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
         self.c_sorted = sorted(self.c2l.keys(), key=len, reverse=True)
-        self.l2c = {tuple(v): k for k, v in self.c2l.items()}  # type: Dict[Tuple[int], str]
+        self.l2c: Dict[Tuple[int, ...], str] = {tuple(v): k for k, v in self.c2l.items()}
         self.l2c_single = {k[0]: v for k, v in self.l2c.items() if len(k) == 1}
         self.strict = strict
         if not self.is_valid:
@@ -116,7 +116,7 @@ def encode(self, s: str) -> IntTensor:
             KrakenEncodeException: if the a subsequence is not encodable and the
                                    codec is set to strict mode.
         """
-        labels = []  # type: List[int]
+        labels: List[int] = []
         idx = 0
         while idx < len(s):
             encodable_suffix = False
diff --git a/kraken/lib/dataset/recognition.py b/kraken/lib/dataset/recognition.py
index 529149810..16d00fa0e 100644
--- a/kraken/lib/dataset/recognition.py
+++ b/kraken/lib/dataset/recognition.py
@@ -112,8 +112,8 @@ def __init__(self,
                           `test` only rows with the appropriate flag set in the
                           file will be considered.
         """
-        self.alphabet = Counter()  # type: Counter
-        self.text_transforms = []  # type: List[Callable[[str], str]]
+        self.alphabet: Counter = Counter()
+        self.text_transforms: List[Callable[[str], str]] = []
         self.failed_samples = set()
         self.transforms = im_transforms
         self.aug = None
@@ -300,10 +300,10 @@ def __init__(self,
                            suitable for forward passes.
             augmentation: Enables augmentation.
         """
-        self._images = []  # type:  Union[List[Image], List[torch.Tensor]]
-        self._gt = []  # type:  List[str]
-        self.alphabet = Counter()  # type: Counter
-        self.text_transforms = []  # type: List[Callable[[str], str]]
+        self._images: Union[List[Image.Image], List[torch.Tensor]] = []
+        self._gt: List[str] = []
+        self.alphabet: Counter = Counter()
+        self.text_transforms: List[Callable[[str], str]] = []
         self.transforms = im_transforms
         self.aug = None
         self.skip_empty_lines = skip_empty_lines
@@ -397,7 +397,7 @@ def encode(self, codec: Optional[PytorchCodec] = None) -> None:
             self.codec = codec
         else:
             self.codec = PytorchCodec(''.join(self.alphabet.keys()))
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], torch.Tensor]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, self.codec.encode(gt)))
 
@@ -405,7 +405,7 @@ def no_encode(self) -> None:
         """
         Creates an unencoded dataset.
         """
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], str]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, gt))
 
@@ -584,7 +584,7 @@ def encode(self, codec: Optional[PytorchCodec] = None) -> None:
             self.codec = codec
         else:
             self.codec = PytorchCodec(''.join(self.alphabet.keys()))
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], torch.Tensor]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, self.codec.encode(gt)))
 
@@ -592,7 +592,7 @@ def no_encode(self) -> None:
         """
         Creates an unencoded dataset.
         """
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], str]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, gt))
 
diff --git a/kraken/lib/dataset/segmentation.py b/kraken/lib/dataset/segmentation.py
index 84d697a49..61a77e4eb 100644
--- a/kraken/lib/dataset/segmentation.py
+++ b/kraken/lib/dataset/segmentation.py
@@ -27,7 +27,7 @@
 from torchvision import transforms
 from collections import defaultdict
 from torch.utils.data import Dataset
-from typing import Dict, Tuple, Sequence, Callable, Any, Union, Literal, Optional, TYPE_CHECKING
+from typing import Dict, Tuple, Sequence, Callable, Any, TYPE_CHECKING
 
 from skimage.draw import polygon
 
@@ -35,7 +35,6 @@
 
 if TYPE_CHECKING:
     from kraken.containers import Segmentation
-    from kraken.lib.xml import XMLPage
 
 
 __all__ = ['BaselineSet']
diff --git a/kraken/lib/dataset/utils.py b/kraken/lib/dataset/utils.py
index b690fdabf..98defd9d9 100644
--- a/kraken/lib/dataset/utils.py
+++ b/kraken/lib/dataset/utils.py
@@ -66,7 +66,7 @@ def __init__(self,
         """
         super().__init__(None)
 
-        self._scale = (height, width)  # type: Tuple[int, int]
+        self._scale: Tuple[int, int] = (height, width)
         self._valid_norm = valid_norm
         self._force_binarization = force_binarization
         self._batch = batch
diff --git a/kraken/lib/layers.py b/kraken/lib/layers.py
index 76f069574..38b25aa1b 100644
--- a/kraken/lib/layers.py
+++ b/kraken/lib/layers.py
@@ -46,7 +46,7 @@ def forward(self, *inputs, output_shape: Optional[Tuple[int, int]] = None):
         outputs = []
         seq_lens = None
         for module in self._modules.values():
-            if type(inputs) == tuple:
+            if isinstance(inputs, tuple):
                 output, seq_lens = module(*inputs, output_shape=output_shape)
                 outputs.append(output)
             else:
@@ -135,7 +135,7 @@ def __init__(self, input_size: int, hidden_size: int) -> None:
 
         self.input_size = input_size
         self.hidden_size = hidden_size
-        self._all_weights = []  # type: List[List[str]]
+        self._all_weights: List[List[str]] = []
         gate_size = 4 * hidden_size
         for direction in range(2):
             w_ih = torch.nn.Parameter(torch.Tensor(gate_size, input_size))
diff --git a/kraken/lib/lstm.py b/kraken/lib/lstm.py
deleted file mode 100644
index 565803051..000000000
--- a/kraken/lib/lstm.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# flake8: noqa
-from typing import Dict
-from scipy.special import expit
-
-initial_range = 0.1
-
-
-class Codec(object):
-    """Translate between integer codes and characters."""
-    def init(self, charset):
-        charset = sorted(list(set(charset)))
-        self.code2char = {}  # type: Dict[int, str]
-        self.char2code = {}  # type:  Dict[str, int]
-        for code,char in enumerate(charset):
-            self.code2char[code] = char
-            self.char2code[char] = code
-        return self
-    def size(self):
-        """The total number of codes (use this for the number of output
-        classes when training a classifier."""
-        return len(list(self.code2char.keys()))
-    def encode(self, s):
-        "Encode the string `s` into a code sequence."
-        dflt = self.char2code["~"]
-        return [self.char2code.get(c,dflt) for c in s]
-    def decode(self, l):
-        "Decode a code sequence into a string."
-        s = [self.code2char.get(c,"~") for c in l]
-        return s
-
-class Network:
-    def predict(self,xs):
-        """Prediction is the same as forward propagation."""
-        return self.forward(xs)
-
-class Softmax(Network):
-    """A logistic regression network."""
-    def __init__(self,Nh,No,initial_range=0.1,rand=None):
-        pass
-    def ninputs(self):
-        pass
-    def noutputs(self):
-        pass
-    def forward(self,ys):
-        pass
-    def backward(self,deltas):
-        pass
-
-
-class LSTM(Network):
-    """A standard LSTM network. This is a direct implementation of all the forward
-    and backward propagation formulas, mainly for speed. (There is another, more
-    abstract implementation as well, but that's significantly slower in Python
-    due to function call overhead.)"""
-    def __init__(self,ni,ns,initial=0.1,maxlen=5000):
-        pass
-
-    def init_weights(self,initial):
-        pass
-
-    def allocate(self,n):
-        pass
-
-    def reset(self,n):
-        pass
-
-    def forward(self,xs):
-        pass
-
-################################################################
-# combination classifiers
-################################################################
-
-class Stacked(Network):
-    """Stack two networks on top of each other."""
-    def __init__(self,nets):
-        self.nets = nets
-    def forward(self,xs):
-        pass
-
-class Reversed(Network):
-    """Run a network on the time-reversed input."""
-    def __init__(self,net):
-        self.net = net
-    def forward(self,xs):
-        pass
-
-class Parallel(Network):
-    """Run multiple networks in parallel on the same input."""
-    def __init__(self,*nets):
-        self.nets = nets
-    def forward(self,xs):
-        pass
-
-def BIDILSTM(Ni,Ns,No):
-    """A bidirectional LSTM, constructed from regular and reversed LSTMs."""
-    lstm1 = LSTM(Ni,Ns)
-    lstm2 = Reversed(LSTM(Ni,Ns))
-    bidi = Parallel(lstm1,lstm2)
-    logreg = Softmax(2*Ns,No)
-    stacked = Stacked([bidi,logreg])
-    return stacked
-
-
-class SeqRecognizer(Network):
-    """Perform sequence recognition using BIDILSTM and alignment."""
-    def __init__(self,ninput,nstates,noutput=-1,codec=None,normalize=None):
-        self.Ni = ninput
-        if codec: noutput = codec.size()
-        self.No = noutput
-        self.lstm = BIDILSTM(ninput,nstates,noutput)
-        self.codec = codec
-    def translate_back(self, output):
-        pass
-    def translate_back_locations(self, output):
-        pass
-    def predictSequence(self,xs):
-        "Predict an integer sequence of codes."
-        pass
-    def l2s(self,l):
-        "Convert a code sequence into a unicode string after recognition."
-        l = self.codec.decode(l)
-        return u"".join(l)
-    def predictString(self,xs):
-        "Predict output as a string. This uses codec and normalizer."
-        pass
diff --git a/kraken/lib/vgsl.py b/kraken/lib/vgsl.py
index 2bc83d693..4335fe7e4 100644
--- a/kraken/lib/vgsl.py
+++ b/kraken/lib/vgsl.py
@@ -127,21 +127,21 @@ def __init__(self, spec: str) -> None:
                     dimension.
         """
         self.spec = spec
-        self.named_spec = []  # type:  List[str]
+        self.named_spec: List[str] = []
         self.ops = [self.build_addition, self.build_identity, self.build_rnn,
                     self.build_dropout, self.build_maxpool, self.build_conv,
                     self.build_output, self.build_reshape, self.build_wav2vec2,
                     self.build_groupnorm, self.build_series,
                     self.build_parallel, self.build_ro]
-        self.codec = None  # type: Optional[PytorchCodec]
-        self.criterion = None  # type: Any
+        self.codec: Optional[PytorchCodec] = None
+        self.criterion: Any = None
         self.nn = layers.MultiParamSequential()
-        self.user_metadata = {'accuracy': [],
-                              'metrics': [],
-                              'seg_type': None,
-                              'one_channel_mode': None,
-                              'model_type': None,
-                              'hyper_params': {}}  # type: dict[str, Any]
+        self.user_metadata: Dict[str, Any] = {'accuracy': [],
+                                              'metrics': [],
+                                              'seg_type': None,
+                                              'one_channel_mode': None,
+                                              'model_type': None,
+                                              'hyper_params': {}}
         self._aux_layers = nn.ModuleDict()
 
         self.idx = -1
@@ -304,12 +304,12 @@ def _deserialize_layers(name, layer):
         if 'codec' in mlmodel.user_defined_metadata:
             nn.add_codec(PytorchCodec(json.loads(mlmodel.user_defined_metadata['codec'])))
 
-        nn.user_metadata = {'accuracy': [],
-                            'metrics': [],
-                            'seg_type': 'bbox',
-                            'one_channel_mode': '1',
-                            'model_type': None,
-                            'hyper_params': {}}  # type: dict[str, str]
+        nn.user_metadata = {'accuracy': [],
+                            'metrics': [],
+                            'seg_type': 'bbox',
+                            'one_channel_mode': '1',
+                            'model_type': None,
+                            'hyper_params': {}}
 
         if 'kraken_meta' in mlmodel.user_defined_metadata:
             nn.user_metadata.update(json.loads(mlmodel.user_defined_metadata['kraken_meta']))
diff --git a/kraken/pageseg.py b/kraken/pageseg.py
index 6cc94ea31..82ab81c0d 100644
--- a/kraken/pageseg.py
+++ b/kraken/pageseg.py
@@ -46,9 +46,9 @@ class record(object):
     """
     def __init__(self, **kw):
         self.__dict__.update(kw)
-        self.label = 0  # type: int
-        self.bounds = []  # type: List
-        self.mask = None  # type: np.ndarray
+        self.label: int = 0
+        self.bounds: List = []
+        self.mask: np.ndarray = None
 
 
 def find(condition):
diff --git a/kraken/serialization.py b/kraken/serialization.py
index 94e490191..e1392feb0 100644
--- a/kraken/serialization.py
+++ b/kraken/serialization.py
@@ -22,7 +22,7 @@
 
 from kraken.lib.util import make_printable
 
-from typing import List, Tuple, Iterable, Optional, Sequence, Literal, TYPE_CHECKING
+from typing import List, Tuple, Iterable, Optional, Sequence, Literal, TYPE_CHECKING, Dict, Any
 
 if TYPE_CHECKING:
     from os import PathLike
@@ -107,14 +107,14 @@ def serialize(results: 'Segmentation',
         The rendered template
     """
     logger.info(f'Serialize {len(results.lines)} records from {results.imagename} with template {template}.')
-    page = {'entities': [],
-            'size': image_size,
-            'name': results.imagename,
-            'writing_mode': writing_mode,
-            'scripts': scripts,
-            'date': datetime.datetime.now(datetime.timezone.utc).isoformat(),
-            'base_dir': [rec.base_dir for rec in results.lines][0] if len(results.lines) else None,
-            'seg_type': results.type}  # type: dict
+    page: Dict[str, Any] = {'entities': [],
+                            'size': image_size,
+                            'name': results.imagename,
+                            'writing_mode': writing_mode,
+                            'scripts': scripts,
+                            'date': datetime.datetime.now(datetime.timezone.utc).isoformat(),
+                            'base_dir': [rec.base_dir for rec in results.lines][0] if len(results.lines) else None,
+                            'seg_type': results.type}
     metadata = {'processing_steps': processing_steps,
                 'version': importlib.metadata.version('kraken')}
 
diff --git a/kraken/transcribe.py b/kraken/transcribe.py
index 5b39ee2f7..6a6ba0a68 100644
--- a/kraken/transcribe.py
+++ b/kraken/transcribe.py
@@ -15,16 +15,17 @@
 """
 Utility functions for ground truth transcription.
 """
-from kraken.lib.exceptions import KrakenInputException
-from kraken.lib.util import get_im_str
-
-from jinja2 import Environment, PackageLoader
 from io import BytesIO
+from typing import List, Dict, Any
+from jinja2 import Environment, PackageLoader
 
 import uuid
 import base64
 import logging
 
+from kraken.lib.exceptions import KrakenInputException
+from kraken.lib.util import get_im_str
+
 logger = logging.getLogger()
 
 
@@ -36,7 +37,7 @@ def __init__(self, font=None, font_style=None):
         env = Environment(loader=PackageLoader('kraken', 'templates'), autoescape=True)
         logger.debug('Loading transcription template.')
         self.tmpl = env.get_template('layout.html')
-        self.pages = []  # type: List[dict]
+        self.pages: List[Dict[Any, Any]] = []
         self.font = {'font': font, 'style': font_style}
         self.text_direction = 'horizontal-tb'
         self.page_idx = 1
diff --git a/tests/test_models.py b/tests/test_models.py
index cb57b05cd..f928d4fcb 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -7,8 +7,6 @@
 from pytest import raises
 from pathlib import Path
 
-import kraken.lib.lstm
-
 from kraken.lib import models
 from kraken.lib.exceptions import KrakenInvalidModelException