In anticipation of the CLI
PonteIneptique committed May 25, 2019
1 parent c4435aa commit 6ac1755
Showing 8 changed files with 29 additions and 34 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -11,6 +11,13 @@ space and they are separated by tabs (`\t`, marked here as `<TAB>`).
Things still need a little tweaking here and there; I'd like to see how Attention will perform. This model is
particularly built for OCR/HTR output from manuscripts where spaces are inconsistent.


```text
Train Loss: 0.004 | Perplexity: 1.004 | Acc.: 0.566 | Lev.: 0.037 | Lev. / char: 0.001
Val. Loss: 0.066 | Perplexity: 1.069 | Acc.: 0.585 | Lev.: 0.272 | Lev. / char: 0.009
Test Loss: 0.057 | Perplexity: 1.059 | Acc.: 0.586 | Lev.: 0.235 | Lev. / char: 0.008
```
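
The Perplexity column appears to be derived from the cross-entropy loss (perplexity ≈ exp(loss)); a quick check with the reported values hard-coded for illustration, where small differences come from rounding of the printed loss:

```python
# Sanity check: perplexity ~= exp(cross-entropy loss) for the values above.
import math

for split, loss in [("Train", 0.004), ("Val.", 0.066), ("Test", 0.057)]:
    print(f"{split}: exp({loss}) = {math.exp(loss):.3f}")
# Train: exp(0.004) = 1.004
# Val.: exp(0.066) = 1.068
# Test: exp(0.057) = 1.059
```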

## Examples

### BiDirectional GRU with Attention
Empty file added boudams/cli.py
Empty file.
10 changes: 7 additions & 3 deletions boudams/dataset/utils.py
@@ -200,6 +200,10 @@ def check(input_path, max_length=100):


if __name__ == "__main__":
convert("/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv", "/home/thibault/dev/boudams/data/seints")
split("/home/thibault/dev/boudams/data/seints/*")
check("/home/thibault/dev/boudams/data/seints/")
output = "/home/thibault/dev/boudams/data/seints"
output = "/home/thibault/dev/boudams/data/fro"
inp = "/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv"
inp = "/home/thibault/dev/boudams/data/inp/*.tab"
convert(inp, output, dict_reader=True)
split(output + "/*")
check(output+"/")
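
For context, a minimal sketch of what a dict-reader-style pass over the input `.tab`/`.tsv` files might look like; this is an assumption about what `dict_reader=True` toggles, not the actual `convert` implementation, and the paths are only those already shown above:

```python
# Illustrative only: iterate tab-separated rows as dicts, presumably what
# dict_reader=True switches convert() to (assumption; see boudams/dataset/utils.py).
import csv
import glob

def iter_rows(pattern: str):
    for path in glob.glob(pattern):
        with open(path, newline="", encoding="utf-8") as f:
            # delimiter="\t" matches the .tab/.tsv inputs used above
            yield from csv.DictReader(f, delimiter="\t")

for row in iter_rows("/home/thibault/dev/boudams/data/inp/*.tab"):
    pass  # each row is a dict keyed by the file's header line
```
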
3 changes: 2 additions & 1 deletion boudams/encoder.py
@@ -7,7 +7,7 @@
import random
import json
import unidecode

from operator import itemgetter

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEFAULT_INIT_TOKEN = "<SOS>"
@@ -299,6 +299,7 @@ def pad_and_tensorize(
# Packed sequences need to be in decreasing size order
for current in sequences:
order.append(sentences.index(current))
sentences[order[-1]] = None  # We blank out this index in case some segments are equal
tensor.append(current + [self.pad_token_index] * (max_len - len(current)))
lengths.append(len(tensor[-1]) - max(0, max_len - len(current)))
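
The new line guards against duplicate segments: once a segment has been matched to its original position, that slot is blanked out so a second, identical segment cannot resolve to the same index. A self-contained sketch of this ordering/padding logic on plain lists (names and pad index are illustrative, not the encoder's API):

```python
# Self-contained sketch of the ordering/padding shown above, on plain lists.
PAD = 2  # illustrative pad token index

def pad_and_order(sentences):
    # Packed sequences must be in decreasing size order.
    sequences = sorted(sentences, key=len, reverse=True)
    max_len = len(sequences[0])
    order, tensor, lengths = [], [], []
    for current in sequences:
        order.append(sentences.index(current))
        # Blank out the matched slot so identical segments get distinct indices.
        sentences[order[-1]] = None
        tensor.append(current + [PAD] * (max_len - len(current)))
        lengths.append(len(current))
    return tensor, order, lengths

print(pad_and_order([[4, 5], [4, 5], [1, 2, 3]]))
# ([[1, 2, 3], [4, 5, 2], [4, 5, 2]], [2, 0, 1], [3, 2, 2])
```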

37 changes: 10 additions & 27 deletions boudams/tagger.py
@@ -1,12 +1,11 @@
import torch
import torch.cuda

from torchtext.data import ReversibleField, BucketIterator

import os
import json
import tarfile
import logging
import re
from typing import List, Tuple

from .model import gru, lstm, bidir, conv, linear
@@ -172,24 +171,6 @@ def sostoken(self):
def eostoken(self):
return self.vocabulary.eos_token_index

def tag(self, iterator: BucketIterator):
self.model.eval()
for i, batch in enumerate(iterator):
src, src_len = batch.src
output, attention = self.model(
src, src_len, trg=None,
teacher_forcing_ratio=0
) # turn off teacher forcing

# trg = [trg sent len, batch size]
# output = [Maximum Sentence Length, Number of Sentence in batch, Number of possible characters]
_, ind = torch.topk(output, 1, dim=2)
# ind = [Maximum Sentence Length, Number of Sentences in Batch, One Result]

# output = output[1:].view(-1, output.shape[-1])

yield ind.squeeze().permute(1, 0)

@property
def settings(self):
return {
@@ -249,13 +230,15 @@ def annotate(self, texts: List[str], batch_size=32):
translations = self.model.predict(
tensor, sentence_length, label_encoder=self.vocabulary
)

for index in range(len(batch)):
for index in range(len(translations)):
yield "".join(translations[order.index(index)])

def annotate_text(self, string, batch_size=32):
strings = [
string[n:n+self.out_max_sentence_length-10]
for n in range(0, len(string), self.out_max_sentence_length - 10)
]
def annotate_text(self, string, splitter=r"(\W+)", batch_size=32):
splitter = re.compile(splitter)
splits = splitter.split(string)

tempList = splits + [""] * 2
strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)]
strings = list(filter(len, strings))

yield from self.annotate(strings, batch_size=batch_size)
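
The new `annotate_text` splits on runs of non-word characters while keeping the separators (the capturing group in `(\W+)`), then glues each chunk back onto the separator that follows it. A standalone demonstration of that splitting step (the function name and sample string are illustrative):

```python
# Standalone demo of the splitting strategy used by the new annotate_text.
import re

def split_keep_separators(string: str, splitter: str = r"(\W+)"):
    splits = re.compile(splitter).split(string)
    padded = splits + [""] * 2          # pad so the last chunk still forms a pair
    chunks = ["".join(padded[n:n + 2]) for n in range(0, len(splits), 2)]
    return list(filter(len, chunks))    # drop empty chunks

print(split_keep_separators("ladamehaitementleroi,etsesbarons"))
# ['ladamehaitementleroi,', 'etsesbarons']
```

Compared with the previous fixed-window slicing, this never cuts in the middle of a run of word characters and keeps punctuation attached to the chunk it follows.
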
2 changes: 1 addition & 1 deletion linear_run.py
@@ -9,7 +9,7 @@
logger.setLevel(logging.DEBUG)

EPOCHS = 100
TEST = "seints"
TEST = "fro"
RANDOM = True
DEVICE = "cuda"
MAXIMUM_LENGTH = 100
2 changes: 1 addition & 1 deletion test.py
@@ -21,7 +21,7 @@
train_path, dev_path, test_path = "data/fro/train.tsv", "data/fro/dev.tsv", "data/fro/test.tsv"


for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"):
for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--23:53:32-0.0001.tar"):
tokenizer = Seq2SeqTokenizer.load(model, device=DEVICE)
print("Model : " + tokenizer.system.upper() + " from " + model)
test_data = tokenizer.vocabulary.get_dataset(test_path, randomized=False)
2 changes: 1 addition & 1 deletion voc-2.json
@@ -1 +1 @@
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "1", "4": ",", "6": "6", "7": "r", "8": "2", "9": "s", "10": "u", "11": "o", "12": "h", "13": "k", "14": "3", "15": "a", "16": "z", "17": "c", "18": "m", "19": "\"", "20": "q", "21": ">", "22": "g", "23": "0", "24": "w", "25": "!", "26": ";", "27": "_", "28": "n", "29": " ", "30": "v", "31": "y", "32": ":", "33": "?", "34": "b", "35": "'", "36": "p", "37": "d", "38": "l", "39": "i", "40": ".", "41": "j", "42": "x", "43": "f", "44": "-", "45": "t", "46": "e"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, ",": 4, "1": 5, "6": 6, "r": 7, "2": 8, "s": 9, "u": 10, "o": 11, "h": 12, "k": 13, "3": 14, "a": 15, "z": 16, "c": 17, "m": 18, "\"": 19, "q": 20, ">": 21, "g": 22, "0": 23, "w": 24, "!": 25, ";": 26, "_": 27, "n": 28, " ": 29, "v": 30, "y": 31, ":": 32, "?": 33, "b": 34, "'": 35, "p": 36, "d": 37, "l": 38, "i": 39, ".": 40, "j": 41, "x": 42, "f": 43, "-": 44, "t": 45, "e": 46}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
