Move variable types from comments to annotations

mittagessen · Jan 19, 2024 · 95981e0
1 parent aea5098
commit 95981e0
Show file tree

Hide file tree

Showing 14 changed files with 58 additions and 186 deletions.
diff --git a/docs/advanced.rst b/docs/advanced.rst
@@ -451,15 +451,15 @@ segmentation and a mapping between scripts and models:
 
 .. code-block:: console
 
-        $ kraken -i ... ... ocr -m Grek:porson.clstm -m Latn:antiqua.clstm
+        $ kraken -i ... ... ocr -m Grek:porson.mlmodel -m Latn:antiqua.mlmodel
 
-All polytonic Greek text portions will be recognized using the `porson.clstm`
-model while Latin text will be fed into the `antiqua.clstm` model. It is
+All polytonic Greek text portions will be recognized using the `porson.mlmodel`
+model while Latin text will be fed into the `antiqua.mlmodel` model. It is
 possible to define a fallback model that other text will be fed to:
 
 .. code-block:: console
 
-        $ kraken -i ... ... ocr -m ... -m ... -m default:porson.clstm
+        $ kraken -i ... ... ocr -m ... -m ... -m default:porson.mlmodel
 
 It is also possible to disable recognition on a particular script by mapping to
 the special model keyword `ignore`. Ignored lines will still be serialized but

diff --git a/kraken/ketos/segmentation.py b/kraken/ketos/segmentation.py
@@ -45,7 +45,7 @@ def _validate_merging(ctx, param, value):
     """
     if not value:
         return None
-    merge_dict = {}  # type: Dict[str, str]
+    merge_dict: Dict[str, str] = {}
     try:
         for m in value:
             lexer = shlex.shlex(m, posix=True)

diff --git a/kraken/kraken.py b/kraken/kraken.py
@@ -30,7 +30,7 @@
 
 from functools import partial
 from rich.traceback import install
-from typing import Dict, cast, Any, IO, Callable
+from typing import Dict, cast, Any, IO, Callable, Union, List
 
 import click
 
@@ -541,7 +541,7 @@ def _validate_mm(ctx, param, value):
     """
     Maps model mappings to a dictionary.
     """
-    model_dict = {'ignore': []}  # type: Dict[str, Union[str, List[str]]]
+    model_dict: Dict[str, Union[str, List[str]]] = {'ignore': []}
     if len(value) == 1:
         lexer = shlex.shlex(value[0], posix=True)
         lexer.wordchars += r'\/.+-()=^&;,.'
@@ -603,7 +603,7 @@ def ocr(ctx, model, pad, reorder, base_dir, no_segmentation, text_direction):
         reorder = base_dir
 
     # first we try to find the model in the absolute path, then ~/.kraken
-    nm = {}  # type: Dict[str, models.TorchSeqRecognizer]
+    nm: Dict[str, models.TorchSeqRecognizer] = {}
     ign_tags = model.pop('ignore')
     for k, v in model.items():
         search = [v,
@@ -629,7 +629,7 @@ def ocr(ctx, model, pad, reorder, base_dir, no_segmentation, text_direction):
     if 'default' in nm:
         from collections import defaultdict
 
-        nn = defaultdict(lambda: nm['default'])  # type: Dict[str, models.TorchSeqRecognizer]
+        nn: Dict[str, models.TorchSeqRecognizer] = defaultdict(lambda: nm['default'])
         nn.update(nm)
         nm = nn
 

diff --git a/kraken/lib/codec.py b/kraken/lib/codec.py
@@ -64,7 +64,7 @@ def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str],
                 raise KrakenCodecException(f'Duplicate entry in codec definition string: {cc}')
             self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
         self.c_sorted = sorted(self.c2l.keys(), key=len, reverse=True)
-        self.l2c = {tuple(v): k for k, v in self.c2l.items()}  # type: Dict[Tuple[int], str]
+        self.l2c: Dict[Tuple[int], str] = {tuple(v): k for k, v in self.c2l.items()}
         self.l2c_single = {k[0]: v for k, v in self.l2c.items() if len(k) == 1}
         self.strict = strict
         if not self.is_valid:
@@ -116,7 +116,7 @@ def encode(self, s: str) -> IntTensor:
             KrakenEncodeException: if a subsequence is not encodable and the
                                    codec is set to strict mode.
         """
-        labels = []  # type: List[int]
+        labels: List[int] = []
         idx = 0
         while idx < len(s):
             encodable_suffix = False

diff --git a/kraken/lib/dataset/recognition.py b/kraken/lib/dataset/recognition.py
@@ -112,8 +112,8 @@ def __init__(self,
                           `test` only rows with the appropriate flag set in the
                           file will be considered.
         """
-        self.alphabet = Counter()  # type: Counter
-        self.text_transforms = []  # type: List[Callable[[str], str]]
+        self.alphabet: Counter = Counter()
+        self.text_transforms: List[Callable[[str], str]] = []
         self.failed_samples = set()
         self.transforms = im_transforms
         self.aug = None
@@ -300,10 +300,10 @@ def __init__(self,
                            suitable for forward passes.
             augmentation: Enables augmentation.
         """
-        self._images = []  # type:  Union[List[Image], List[torch.Tensor]]
-        self._gt = []  # type:  List[str]
-        self.alphabet = Counter()  # type: Counter
-        self.text_transforms = []  # type: List[Callable[[str], str]]
+        self._images: Union[List[Image.Image], List[torch.Tensor]] = []
+        self._gt: List[str] = []
+        self.alphabet: Counter = Counter()
+        self.text_transforms: List[Callable[[str], str]] = []
         self.transforms = im_transforms
         self.aug = None
         self.skip_empty_lines = skip_empty_lines
@@ -397,15 +397,15 @@ def encode(self, codec: Optional[PytorchCodec] = None) -> None:
             self.codec = codec
         else:
             self.codec = PytorchCodec(''.join(self.alphabet.keys()))
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], torch.Tensor]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, self.codec.encode(gt)))
 
     def no_encode(self) -> None:
         """
         Creates an unencoded dataset.
         """
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], str]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, gt))
 
@@ -584,15 +584,15 @@ def encode(self, codec: Optional[PytorchCodec] = None) -> None:
             self.codec = codec
         else:
             self.codec = PytorchCodec(''.join(self.alphabet.keys()))
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], torch.Tensor]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], torch.Tensor]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, self.codec.encode(gt)))
 
     def no_encode(self) -> None:
         """
         Creates an unencoded dataset.
         """
-        self.training_set = []  # type: List[Tuple[Union[Image, torch.Tensor], str]]
+        self.training_set: List[Tuple[Union[Image.Image, torch.Tensor], str]] = []
         for im, gt in zip(self._images, self._gt):
             self.training_set.append((im, gt))
 

diff --git a/kraken/lib/dataset/segmentation.py b/kraken/lib/dataset/segmentation.py
@@ -27,15 +27,14 @@
 from torchvision import transforms
 from collections import defaultdict
 from torch.utils.data import Dataset
-from typing import Dict, Tuple, Sequence, Callable, Any, Union, Literal, Optional, TYPE_CHECKING
+from typing import Dict, Tuple, Sequence, Callable, Any, TYPE_CHECKING
 
 from skimage.draw import polygon
 
 from kraken.lib.segmentation import scale_regions
 
 if TYPE_CHECKING:
     from kraken.containers import Segmentation
-    from kraken.lib.xml import XMLPage
 
 
 __all__ = ['BaselineSet']

diff --git a/kraken/lib/dataset/utils.py b/kraken/lib/dataset/utils.py
@@ -66,7 +66,7 @@ def __init__(self,
         """
         super().__init__(None)
 
-        self._scale = (height, width)  # type: Tuple[int, int]
+        self._scale: Tuple[int, int] = (height, width)
         self._valid_norm = valid_norm
         self._force_binarization = force_binarization
         self._batch = batch

diff --git a/kraken/lib/layers.py b/kraken/lib/layers.py
@@ -46,7 +46,7 @@ def forward(self, *inputs, output_shape: Optional[Tuple[int, int]] = None):
         outputs = []
         seq_lens = None
         for module in self._modules.values():
-            if type(inputs) == tuple:
+            if isinstance(inputs, tuple):
                 output, seq_lens = module(*inputs, output_shape=output_shape)
                 outputs.append(output)
             else:
@@ -135,7 +135,7 @@ def __init__(self, input_size: int, hidden_size: int) -> None:
 
         self.input_size = input_size
         self.hidden_size = hidden_size
-        self._all_weights = []  # type: List[List[str]]
+        self._all_weights: List[List[str]] = []
         gate_size = 4 * hidden_size
         for direction in range(2):
             w_ih = torch.nn.Parameter(torch.Tensor(gate_size, input_size))

diff --git a/kraken/lib/lstm.py b/kraken/lib/lstm.py
diff --git a/kraken/lib/vgsl.py b/kraken/lib/vgsl.py
@@ -127,21 +127,21 @@ def __init__(self, spec: str) -> None:
                     dimension.
         """
         self.spec = spec
-        self.named_spec = []  # type:  List[str]
+        self.named_spec: List[str] = []
         self.ops = [self.build_addition, self.build_identity, self.build_rnn,
                     self.build_dropout, self.build_maxpool, self.build_conv,
                     self.build_output, self.build_reshape, self.build_wav2vec2,
                     self.build_groupnorm, self.build_series,
                     self.build_parallel, self.build_ro]
-        self.codec = None  # type: Optional[PytorchCodec]
-        self.criterion = None  # type: Any
+        self.codec: Optional[PytorchCodec] = None
+        self.criterion: Any = None
         self.nn = layers.MultiParamSequential()
-        self.user_metadata = {'accuracy': [],
-                              'metrics': [],
-                              'seg_type': None,
-                              'one_channel_mode': None,
-                              'model_type': None,
-                              'hyper_params': {}}  # type: dict[str, Any]
+        self.user_metadata: Dict[str, Any] = {'accuracy': [],
+                                              'metrics': [],
+                                              'seg_type': None,
+                                              'one_channel_mode': None,
+                                              'model_type': None,
+                                              'hyper_params': {}}
         self._aux_layers = nn.ModuleDict()
 
         self.idx = -1
@@ -304,12 +304,12 @@ def _deserialize_layers(name, layer):
         if 'codec' in mlmodel.user_defined_metadata:
             nn.add_codec(PytorchCodec(json.loads(mlmodel.user_defined_metadata['codec'])))
 
-        nn.user_metadata = {'accuracy': [],
-                            'metrics': [],
-                            'seg_type': 'bbox',
-                            'one_channel_mode': '1',
-                            'model_type': None,
-                            'hyper_params': {}}  # type: dict[str, str]
+        nn.user_metadata: Dict[str, Any] = {'accuracy': [],
+                                            'metrics': [],
+                                            'seg_type': 'bbox',
+                                            'one_channel_mode': '1',
+                                            'model_type': None,
+                                            'hyper_params': {}}
 
         if 'kraken_meta' in mlmodel.user_defined_metadata:
             nn.user_metadata.update(json.loads(mlmodel.user_defined_metadata['kraken_meta']))

diff --git a/kraken/pageseg.py b/kraken/pageseg.py
@@ -46,9 +46,9 @@ class record(object):
     """
     def __init__(self, **kw):
         self.__dict__.update(kw)
-        self.label = 0  # type: int
-        self.bounds = []  # type: List
-        self.mask = None  # type: np.ndarray
+        self.label: int = 0
+        self.bounds: List = []
+        self.mask: np.ndarray = None
 
 
 def find(condition):