From 6ac1755afd09bf598d8b10a906e20b97941b3cba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 25 May 2019 07:56:25 +0200 Subject: [PATCH] Prevision du cli --- README.md | 7 +++++++ boudams/cli.py | 0 boudams/dataset/utils.py | 10 +++++++--- boudams/encoder.py | 3 ++- boudams/tagger.py | 37 ++++++++++--------------------------- linear_run.py | 2 +- test.py | 2 +- voc-2.json | 2 +- 8 files changed, 29 insertions(+), 34 deletions(-) create mode 100644 boudams/cli.py diff --git a/README.md b/README.md index 3d657db..1aa4d79 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,13 @@ space and they are separated by tabs (`\t`, marked here as ``). Things needs a little more tweaks here and there again, I'd like to see how Attention will perform. This model is particulary built for OCR/HTR output from manuscripts where spaces are inconsistent. + +```text + Train Loss: 0.004 | Perplexity: 1.004 | Acc.: 0.566 | Lev.: 0.037 | Lev. / char: 0.001 + Val. Loss: 0.066 | Perplexity: 1.069 | Acc.: 0.585 | Lev.: 0.272 | Lev. / char: 0.009 + Test Loss: 0.057 | Perplexity: 1.059 | Acc,: 0.586 | Lev.: 0.235 | Lev. / char: 0.008 +``` + ## Examples ### BiDirectional GRU with Attention diff --git a/boudams/cli.py b/boudams/cli.py new file mode 100644 index 0000000..e69de29 diff --git a/boudams/dataset/utils.py b/boudams/dataset/utils.py index 63d10ee..1b5d0d6 100644 --- a/boudams/dataset/utils.py +++ b/boudams/dataset/utils.py @@ -200,6 +200,10 @@ def check(input_path, max_length=100): if __name__ == "__main__": - convert("/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv", "/home/thibault/dev/boudams/data/seints") - split("/home/thibault/dev/boudams/data/seints/*") - check("/home/thibault/dev/boudams/data/seints/") + output = "/home/thibault/dev/boudams/data/seints" + output = "/home/thibault/dev/boudams/data/fro" + inp = "/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv" + inp = "/home/thibault/dev/boudams/data/inp/*.tab" + convert(inp, output, dict_reader=True) + split(output + "/*") + check(output+"/") diff --git a/boudams/encoder.py b/boudams/encoder.py index c54cf08..22d0867 100644 --- a/boudams/encoder.py +++ b/boudams/encoder.py @@ -7,7 +7,7 @@ import random import json import unidecode - +from operator import itemgetter DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") DEFAULT_INIT_TOKEN = "" @@ -299,6 +299,7 @@ def pad_and_tensorize( # Packed sequence need to be in decreasing size order for current in sequences: order.append(sentences.index(current)) + sentences[order[-1]] = None # We replace this index with nothing in case some segments are equals tensor.append(current + [self.pad_token_index] * (max_len - len(current))) lengths.append(len(tensor[-1]) - max(0, max_len - len(current))) diff --git a/boudams/tagger.py b/boudams/tagger.py index 63ce66f..e269b75 100644 --- a/boudams/tagger.py +++ b/boudams/tagger.py @@ -1,12 +1,11 @@ import torch import torch.cuda -from torchtext.data import ReversibleField, BucketIterator - import os import json import tarfile import logging +import re from typing import List, Tuple from .model import gru, lstm, bidir, conv, linear @@ -172,24 +171,6 @@ def sostoken(self): def eostoken(self): return self.vocabulary.eos_token_index - def tag(self, iterator: BucketIterator): - self.model.eval() - for i, batch in enumerate(iterator): - src, src_len = batch.src - output, attention = self.model( - src, src_len, trg=None, - teacher_forcing_ratio=0 - ) # turn off teacher forcing - - # trg = [trg sent len, batch size] - # output = [Maximum Sentence Length, Number of Sentence in batch, Number of possible characters] - _, ind = torch.topk(output, 1, dim=2) - # ind = [Maximum Sentence Length, Number of Sentences in Batch, One Result] - - # output = output[1:].view(-1, output.shape[-1]) - - yield ind.squeeze().permute(1, 0) - @property def settings(self): return { @@ -249,13 +230,15 @@ def annotate(self, texts: List[str], batch_size=32): translations = self.model.predict( tensor, sentence_length, label_encoder=self.vocabulary ) - - for index in range(len(batch)): + for index in range(len(translations)): yield "".join(translations[order.index(index)]) - def annotate_text(self, string, batch_size=32): - strings = [ - string[n:n+self.out_max_sentence_length-10] - for n in range(0, len(string), self.out_max_sentence_length - 10) - ] + def annotate_text(self, string, splitter=r"(\W+)", batch_size=32): + splitter = re.compile(splitter) + splits = splitter.split(string) + + tempList = splits + [""] * 2 + strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)] + strings = list(filter(len, strings)) + yield from self.annotate(strings, batch_size=batch_size) diff --git a/linear_run.py b/linear_run.py index 74bc5f4..e6ae68e 100644 --- a/linear_run.py +++ b/linear_run.py @@ -9,7 +9,7 @@ logger.setLevel(logging.DEBUG) EPOCHS = 100 -TEST = "seints" +TEST = "fro" RANDOM = True DEVICE = "cuda" MAXIMUM_LENGTH = 100 diff --git a/test.py b/test.py index 5eedc72..997eb3a 100644 --- a/test.py +++ b/test.py @@ -21,7 +21,7 @@ train_path, dev_path, test_path = "data/fro/train.tsv", "data/fro/dev.tsv", "data/fro/test.tsv" -for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"): +for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--23:53:32-0.0001.tar"): tokenizer = Seq2SeqTokenizer.load(model, device=DEVICE) print("Model : " + tokenizer.system.upper() + " from " + model) test_data = tokenizer.vocabulary.get_dataset(test_path, randomized=False) diff --git a/voc-2.json b/voc-2.json index 7689ce9..83f15c5 100644 --- a/voc-2.json +++ b/voc-2.json @@ -1 +1 @@ -{"itos": {"0": "", "1": "", "2": "", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"": 0, "": 1, "": 2, "": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "", "eos_token": "", "pad_token": "", "unk_token": "", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}} \ No newline at end of file +{"itos": {"0": "", "1": "", "2": "", "5": "1", "4": ",", "6": "6", "7": "r", "8": "2", "9": "s", "10": "u", "11": "o", "12": "h", "13": "k", "14": "3", "15": "a", "16": "z", "17": "c", "18": "m", "19": "\"", "20": "q", "21": ">", "22": "g", "23": "0", "24": "w", "25": "!", "26": ";", "27": "_", "28": "n", "29": " ", "30": "v", "31": "y", "32": ":", "33": "?", "34": "b", "35": "'", "36": "p", "37": "d", "38": "l", "39": "i", "40": ".", "41": "j", "42": "x", "43": "f", "44": "-", "45": "t", "46": "e"}, "stoi": {"": 0, "": 1, "": 2, "": 5, ",": 4, "1": 5, "6": 6, "r": 7, "2": 8, "s": 9, "u": 10, "o": 11, "h": 12, "k": 13, "3": 14, "a": 15, "z": 16, "c": 17, "m": 18, "\"": 19, "q": 20, ">": 21, "g": 22, "0": 23, "w": 24, "!": 25, ";": 26, "_": 27, "n": 28, " ": 29, "v": 30, "y": 31, ":": 32, "?": 33, "b": 34, "'": 35, "p": 36, "d": 37, "l": 38, "i": 39, ".": 40, "j": 41, "x": 42, "f": 43, "-": 44, "t": 45, "e": 46}, "params": {"init_token": "", "eos_token": "", "pad_token": "", "unk_token": "", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}} \ No newline at end of file