
Commit

Boudams now has a mode system that will allow much more flexibility in the future

- Legacy mode is "simple-space"
- A new mode, "advanced-space", allows for training a model on strings that have spaces in them (see the sketch below)
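
As a rough illustration of the difference between the two modes (a hypothetical sketch, not the actual `boudams.modes` API):

```python
# Hypothetical sketch of what the two modes imply for training samples;
# names and label values are illustrative, not the actual boudams.modes API.

def simple_space_sample(ground_truth: str):
    """simple-space: the input never contains spaces; each character is
    labelled 1 if a space follows it in the ground truth, else 0."""
    chars, labels = [], []
    for i, char in enumerate(ground_truth):
        if char == " ":
            continue
        chars.append(char)
        follows = ground_truth[i + 1] if i + 1 < len(ground_truth) else ""
        labels.append(1 if follows == " " else 0)
    return "".join(chars), labels

print(simple_space_sample("the cat"))  # ('thecat', [0, 0, 1, 0, 0, 0])

# advanced-space additionally keeps some real spaces in the input and injects
# fake ones (with configurable probabilities), so the model must also learn
# which spaces to keep and which to delete.
```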

commit 1d06489
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 12 11:14:54 2022 +0200

    Why not

commit b2541e9
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 12 11:14:20 2022 +0200

    Removed out-commented data

commit 824325a
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 12 11:10:15 2022 +0200

    Probably working model tagging

commit 72e8cd9
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 12 09:41:12 2022 +0200

    Mode AdvancedSpace is working, need to see at training time now

commit ec9904c
Author: Thibault Clérice <[email protected]>
Date:   Mon Apr 11 16:54:40 2022 +0200

    Working SimpleSpaceMode

commit c110f52
Author: Thibault Clérice <[email protected]>
Date:   Mon Apr 11 16:21:10 2022 +0200

    Add gitignore

commit 2580abe
Merge: 42662eb 68a4ee6
Author: Thibault Clérice <[email protected]>
Date:   Mon Apr 11 16:19:15 2022 +0200

    Merge branch '1.0.0/new-data-formats' of github.com:PonteIneptique/boudams into 1.0.0/new-data-formats

commit 42662eb
Author: Thibault Clérice <[email protected]>
Date:   Mon Apr 11 16:17:31 2022 +0200

    [WIP] Cli should be working

commit a1ba7b3
Author: Thibault Clérice <[email protected]>
Date:   Mon Apr 11 15:29:13 2022 +0200

    [WIP] Working on splitter for data generation

commit 240158c
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 5 17:01:25 2022 +0200

    [WIP] Moving the mask mechanism to a new Mode class

commit 68a4ee6
Author: Thibault Clérice <[email protected]>
Date:   Tue Apr 5 17:01:25 2022 +0200

    [WIP] Moving the mask mechanism to a new Mode class
PonteIneptique committed Apr 12, 2022
1 parent 149c9a1 commit d8ed1d5
Showing 26 changed files with 123,383 additions and 511 deletions.
28 changes: 28 additions & 0 deletions CustomVGSL.md
@@ -0,0 +1,28 @@
# Custom Architecture Building String

*This follows the example of the [VGSL specs](https://tesseract-ocr.github.io/tessdoc/tess4/VGSLSpecs.html).*

The new spec system is built around custom architecture strings.

Available modules:

- `C[A]<x>,<d>` uses a convolutional layer where `x` is the n-gram window and `d` the output dimension.
- `CP[A]<x>,<d>` uses a convolutional layer with positional embeddings where `x` is the n-gram window and `d` the output dimension.
- `L[A]<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
- `G[A]<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
- `D<r>` uses a Dropout layer with a rate of `r`.
- `L<d>` uses a Linear layer of dimension `d` (a single value after `L` denotes a Linear layer, two comma-separated values a Bi-LSTM).

`[A]` can be replaced with an activation layer, such as:

- `s` = sigmoid
- `t` = tanh
- `r` = relu
- `l` = linear (i.e., no non-linearity)
- `m` = softmax
- `n` = n/a

The VGSL module must start with an embedding size: `E<dim>`.

Example: `[E200 L120 L200 Cr3,10 D3]` uses a 200-dimension embedding, two linear layers (of dimensions 120 and 200), then a convolutional layer (3-gram window, output dimension 10) with a ReLU activation, over which 30% dropout is applied before classification.
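
As a rough illustration only, such a string could be tokenized as in the sketch below. The regular expression and the resulting dictionaries are assumptions for this example, not the actual Boudams parser:

```python
import re

# Hypothetical tokenizer for the architecture string; illustrative only.
# Note how the comma disambiguates the overloaded `L`: "L120,2" is a Bi-LSTM,
# "L120" a Linear layer.
TOKEN = re.compile(
    r"E(?P<emb>\d+)"                                             # E200
    r"|CP?(?P<c_act>[strlmn])?(?P<ngram>\d+),(?P<c_out>\d+)"     # C / CP modules
    r"|[LG](?P<r_act>[strlmn])?(?P<hidden>\d+),(?P<layers>\d+)"  # Bi-LSTM / Bi-GRU
    r"|D(?P<rate>\d+)"                                           # D3 -> 30% dropout
    r"|L(?P<dim>\d+)"                                            # Linear layer
)

def parse_arch(spec: str):
    modules = []
    for token in spec.strip("[]").split():
        match = TOKEN.fullmatch(token)
        if match is None:
            raise ValueError(f"Unparsable module: {token}")
        # Keep only the named groups that actually matched this token
        modules.append({k: v for k, v in match.groupdict().items() if v is not None})
    return modules

print(parse_arch("[E200 L120 L200 Cr3,10 D3]"))
# [{'emb': '200'}, {'dim': '120'}, {'dim': '200'},
#  {'c_act': 'r', 'ngram': '3', 'c_out': '10'}, {'rate': '3'}]
```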
85 changes: 54 additions & 31 deletions boudams/cli.py
@@ -14,8 +14,13 @@
from boudams.tagger import BoudamsTagger, OptimizerParams
from boudams.trainer import Trainer, logger, ACCEPTABLE_MONITOR_METRICS
from boudams.encoder import LabelEncoder
from boudams.modes import SimpleSpaceMode, AdvancedSpaceMode
from boudams.dataset import BoudamsDataset
from boudams.data_generation import conllu, base as dataset_base, plaintext
from boudams.data_generation import base as dataset_base, plaintext, splitter as all_splitters
from boudams.utils import parse_params


_POSSIBLE_MODES = list(LabelEncoder.Modes.keys())


@click.group()
@@ -28,41 +33,54 @@ def dataset():
    """ Dataset related functions """


def _get_mode(mode: str, mode_kwargs: str = "") -> SimpleSpaceMode:
    if mode == "simple-space":
        return SimpleSpaceMode()
    elif mode == "advanced-space":
        return AdvancedSpaceMode()
    raise ValueError(f"Unknown mode: {mode}")  # guard against silently returning None


@dataset.command("convert")
@click.argument("method", type=click.Choice(['tsv', 'tsv-header', 'plain-text']))
@click.argument("output_path", type=click.Path(file_okay=False))
@click.argument("splitter", type=click.Choice(['words', 'sentence']))
@click.argument("input_path", nargs=-1, type=click.Path(file_okay=True, dir_okay=False))
@click.option("--min_words", type=int, default=2, help="Minimum of words to build a line")
@click.option("--max_words", type=int, default=10, help="Maximum number of words to build a line")
@click.option("--min_char_length", type=int, default=7, help="Minimum amount of characters to build a line")
@click.option("--max_char_length", type=int, default=100, help="Maximum amount of characters to build a line")
@click.option("--random_keep", type=float, default=0.3, help="Probability to keep some words for the next sequence")
@click.option("--max_kept", type=int, default=1, help="Maximum amount of words to be kept over next sequence")
@click.option("--noise_char", type=str, default=".", help="Character to add between words for noise purposes")
@click.option("--noise_char_random", type=float, default=0.2, help="Probability to add [NOISE_CHAR] in between words")
@click.option("--max_noise_char", type=int, default=2, help="Maximum amount of [NOISE_CHAR] to add sequentially")
def convert(method, output_path, input_path, min_words, max_words, min_char_length,
            max_char_length, random_keep, max_kept, noise_char, noise_char_random, max_noise_char):
@click.argument("output_path", type=click.Path(file_okay=False))
@click.option("--mode", type=click.Choice(_POSSIBLE_MODES),
              default="simple-space", show_default=True,
              help="Type of encoder you want to set up")
@click.option("--splitter-regex", type=str, default=None, show_default=True,
              help="Regular expression for some splitter")
@click.option("--min-chars", type=int, default=2, show_default=True,
              help="Discard samples smaller than min-chars")
@click.option("--min_words", type=int, default=2, show_default=True,
              help="Minimum number of words to build a line [Word splitter only]")
@click.option("--max_words", type=int, default=10, show_default=True,
              help="Maximum number of words to build a line [Word splitter only]")
@click.option("--mode-ratios", type=str, default="", show_default=True,
              help="Token ratios for modes at mask generation, e.g. `keep-space=.3&fake-space=.01` "
                   "will give a 30% chance of keeping a space and a 1% chance of generating a fake "
                   "space after each character")
def convert(output_path, input_path, mode, splitter, splitter_regex, min_words, max_words, min_chars,
            mode_ratios):
""" Build sequence training data using files with [METHOD] format in [INPUT_PATH] and saving the
converted format into [OUTPUT_PATH]
If you are using `tsv-header` as a method, columns containing tokens should be named "tokens" or "form"
"""
if method.startswith("tsv"):
conllu.convert(
input_path, output_path, min_words=min_words, max_words=max_words,
min_char_length=min_char_length, max_char_length=max_char_length,
random_keep=random_keep, max_kept=max_kept, noise_char=noise_char,
noise_char_random=noise_char_random, max_noise_char=max_noise_char,
dict_reader=method.endswith("header")
if splitter == "words":
splitter = all_splitters.WordSplitter(
min_words=min_words,
max_words=max_words,
**({"splitter": splitter_regex} if splitter_regex else {})
)
else:
plaintext.convert(
input_path, output_path, min_words=min_words, max_words=max_words,
min_char_length=min_char_length, max_char_length=max_char_length,
random_keep=random_keep, max_kept=max_kept, noise_char=noise_char,
noise_char_random=noise_char_random, max_noise_char=max_noise_char
splitter = all_splitters.SentenceSplitter(
**({"splitter": splitter_regex} if splitter_regex else {})
)
plaintext.convert(
input_path, output_path,
splitter=splitter, mode=_get_mode(mode=mode),
min_chars=min_chars,
token_ratio=parse_params(mode_ratios)
)
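
The `--mode-ratios` value above is a query-string-like list of ratios. As a rough sketch of that behaviour (the actual `boudams.utils.parse_params` implementation may differ), such a string could be parsed as follows:

```python
from urllib.parse import parse_qsl

def parse_ratios(ratios: str) -> dict:
    # Hypothetical stand-in for boudams.utils.parse_params: turns
    # "keep-space=.3&fake-space=.01" into {"keep-space": 0.3, "fake-space": 0.01}.
    return {key: float(value) for key, value in parse_qsl(ratios)}

print(parse_ratios("keep-space=.3&fake-space=.01"))
# {'keep-space': 0.3, 'fake-space': 0.01}
```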


@dataset.command("statistics")
@@ -125,7 +143,8 @@ def generate(output_path, input_path, max_char_length, train_ratio, test_ratio):
        return
    dataset_base.split(input_path, output_path, max_char_length=max_char_length,
                       ratio=(train_ratio, dev_ratio, test_ratio))
    dataset_base.check(output_path, max_length=max_char_length)
    #dataset_base.check(output_path, max_length=max_char_length)


@cli.command("template")
@click.argument("filename", type=click.File(mode="w"))
Expand Down Expand Up @@ -162,6 +181,9 @@ def template(filename):

@cli.command("train")
@click.argument("config_files", nargs=-1, type=click.File("r"))
@click.option("--mode", type=click.Choice(_POSSIBLE_MODES),
default="simple-space", show_default=True,
help="Type of encoder you want to set-up")
@click.option("--output", type=click.Path(dir_okay=False, exists=False), default=None, help="Model Name")
@click.option("--epochs", type=int, default=100, help="Number of epochs to run")
@click.option("--batch_size", type=int, default=32, help="Size of batches")
@@ -173,16 +195,16 @@ def template(filename):
@click.option("--avg", default="macro", type=click.Choice(["micro", "macro"]), help="Type of avering method to use on "
"metrics")
@click.option("--delta", default=.001, type=float, help="Minimum change in the monitored quantity to qualify as an "
"improvement")
"improvement")
@click.option("--patience", default=3, type=int, help="Number of checks with no improvement after which training "
"will be stopped")
"will be stopped")
@click.option("--seed", default=None, type=int, help="Runs deterministic training")
@click.option("--optimizer", default="Adams", type=click.Choice(["Adams"]), help="Optimizer to use")
# ToDo: Figure out the bug with Ranger
# pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call
# `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the
# `optimizer.step(optimizer_closure)` call did not execute it internally.
def train(config_files: List[click.File], output: str,
def train(config_files: List[click.File], output: str, mode: str,
          epochs: int, batch_size: int, device: str, debug: bool, workers: int,
          auto_lr: bool,
          metric: str, avg: str, delta: float, patience: int,
@@ -210,6 +232,7 @@ def train(config_files: List[click.File], output: str,

    vocabulary = LabelEncoder(
        maximum_length=config.get("max_sentence_size", None),
        mode=mode,
        remove_diacriticals=config["label_encoder"].get("normalize", True),
        lower=config["label_encoder"].get("lower", True)
    )
@@ -414,7 +437,7 @@ def tag_check(config_model, content, device="cpu", batch_size=64):
    boudams.eval()
    boudams.to(device)
    click.echo(f"\t[X] Model loaded")
    click.echo(" ".join(boudams.annotate_text(content, batch_size=batch_size, device=device)))
    click.echo("\n".join(boudams.annotate_text(content, splitter="([\.!\?]+)", batch_size=batch_size, device=device)))


@cli.command("graph")
59 changes: 12 additions & 47 deletions boudams/data_generation/base.py
@@ -2,57 +2,18 @@
import os
import random
import re
from typing import List, Tuple, Iterable, Union

from typing import Tuple, Iterable, Union

_space = re.compile("(\s+)")


def normalize_space(string: str) -> str:
    """ Normalizes whitespace by replacing any sequence of spaces with a single space ' ' """
    return _space.sub(" ", string)


def untokenize(sentence: Iterable[str]) -> Tuple[str, str]:
    """ Transform a sequence of words into both a string without spaces (first
    element of the tuple) and a string with spaces (second element of the tuple)"""
    return "".join(sentence), " ".join(sentence)


def formatter(sequence: Iterable[str]):
    """ Joins a sequence of words into Training and Ground Truth format
    :param sequence: Sequence of words
    :return:
    """
    return "\t".join(untokenize(sequence)).replace("\n", "") + "\n"


def write_sentence(io_file, sentence: List[str], max_chars: int = 150):
    """ Write a sentence to a file, flushing it in sequences of at most max_chars characters
    :param io_file: File to write to
    :param sentence: Sequence for training and ground_truth
    :param max_chars: Maximum number of characters to keep
    :return:
    """
    sequence = []
    for word in sentence:
        if len(" ".join(sequence)) >= max_chars:
            io_file.write(formatter(sequence))
            sequence = []
        sequence.append(word)

    if len(sequence):
        io_file.write(formatter(sequence))


def check(input_path: str, max_length: int = 100):
    """ Check train.tsv, dev.tsv and test.tsv in [INPUT_PATH] and print a report
    :param input_path: Directory containing train, dev and test
    :param max_length: Maximum character length for input or output
    """
    raise Exception("This function has not been upgraded to the mode system")
    files = ("test.tsv", "dev.tsv", "train.tsv")
    for file in files:
        max_chars, min_chars, min_words, max_words = 0, max_length, max_length, 0
@@ -85,15 +46,19 @@ def check(input_path: str, max_length: int = 100):
        print("------")


def split(input_path: Union[str, Iterable[str]], output_path: str, ratio: Tuple[float, float, float] = (0.8, 0.1, 0.1),
          max_char_length: int = 150):
def split(
        input_path: Union[str, Iterable[str]],
        output_path: str,
        ratio: Tuple[float, float, float] = (0.8, 0.1, 0.1),
        max_char_length: int = 150
):
    """ Split a corpus of files into train, dev and test sets
    :param input_path: List of paths or glob-like patterns
    :param output_path: Where to save!
    :param ratio: Ratio (Train, Dev, Test)
    :param max_char_length: Maximum length of input or output
    """
    max_char_length -= 2  # Remove SOS and EOS tokens

    train_ratio, dev_ratio, test_ratio = ratio
    if train_ratio + dev_ratio + test_ratio != 1.0:
@@ -133,9 +98,9 @@ def split(input_path: Union[str, Iterable[str]], output_path: str, ratio: Tuple[
                tgt = dev_io
            cur = line.split("\t")

            if len(cur[0]) > max_char_length or len(cur[1].strip()) > max_char_length:
                print("---- [ERROR] Line %s is ignored because it's too large ! `%s`" % (line_index, line))
                continue
            #if len(cur[0]) > max_char_length or len(cur[1].strip()) > max_char_length:
            #    print("---- [ERROR] Line %s is ignored because it's too large ! `%s`" % (line_index, line))
            #    continue
            tgt.write(line)

    print("[DONE] Files available at %s " % output_path)
120 changes: 0 additions & 120 deletions boudams/data_generation/conllu.py

This file was deleted.

