From 6ac1755afd09bf598d8b10a906e20b97941b3cba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= <leponteineptique@gmail.com>
Date: Sat, 25 May 2019 07:56:25 +0200
Subject: [PATCH] Prevision du cli

---
 README.md                |  7 +++++++
 boudams/cli.py           |  0
 boudams/dataset/utils.py | 10 +++++++---
 boudams/encoder.py       |  3 ++-
 boudams/tagger.py        | 37 ++++++++++---------------------------
 linear_run.py            |  2 +-
 test.py                  |  2 +-
 voc-2.json               |  2 +-
 8 files changed, 29 insertions(+), 34 deletions(-)
 create mode 100644 boudams/cli.py
diff --git a/README.md b/README.md
index 3d657db..1aa4d79 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,13 @@ space and they are separated by tabs (`\t`, marked here as `<TAB>`).
 Things needs a little more tweaks here and there again, I'd like to see how Attention will perform. This model is 
 particulary built for OCR/HTR output from manuscripts where spaces are inconsistent.
 
+
+```text
+	Train Loss: 0.004 | Perplexity:   1.004 |  Acc.: 0.566 |  Lev.: 0.037 |  Lev. / char: 0.001
+	 Val. Loss: 0.066 | Perplexity:   1.069 |  Acc.: 0.585 |  Lev.: 0.272 |  Lev. / char: 0.009
+	 Test Loss: 0.057 | Perplexity:   1.059 |  Acc.: 0.586 |  Lev.: 0.235 |  Lev. / char: 0.008
+```
+
 ## Examples
 
 ### BiDirectional GRU with Attention
diff --git a/boudams/cli.py b/boudams/cli.py
new file mode 100644
index 0000000..e69de29
diff --git a/boudams/dataset/utils.py b/boudams/dataset/utils.py
index 63d10ee..1b5d0d6 100644
--- a/boudams/dataset/utils.py
+++ b/boudams/dataset/utils.py
@@ -200,6 +200,10 @@ def check(input_path, max_length=100):
 
 
 if __name__ == "__main__":
-    convert("/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv", "/home/thibault/dev/boudams/data/seints")
-    split("/home/thibault/dev/boudams/data/seints/*")
-    check("/home/thibault/dev/boudams/data/seints/")
+    output = "/home/thibault/dev/boudams/data/seints"
+    output = "/home/thibault/dev/boudams/data/fro"
+    inp = "/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv"
+    inp = "/home/thibault/dev/boudams/data/inp/*.tab"
+    convert(inp, output, dict_reader=True)
+    split(output + "/*")
+    check(output+"/")
diff --git a/boudams/encoder.py b/boudams/encoder.py
index c54cf08..22d0867 100644
--- a/boudams/encoder.py
+++ b/boudams/encoder.py
@@ -7,7 +7,7 @@
 import random
 import json
 import unidecode
-
+from operator import itemgetter
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 DEFAULT_INIT_TOKEN = "<SOS>"
@@ -299,6 +299,7 @@ def pad_and_tensorize(
         # Packed sequence need to be in decreasing size order
         for current in sequences:
             order.append(sentences.index(current))
+            sentences[order[-1]] = None  # Blank the matched slot so that an identical later sequence resolves to its own index
             tensor.append(current + [self.pad_token_index] * (max_len - len(current)))
             lengths.append(len(tensor[-1]) - max(0, max_len - len(current)))
 
diff --git a/boudams/tagger.py b/boudams/tagger.py
index 63ce66f..e269b75 100644
--- a/boudams/tagger.py
+++ b/boudams/tagger.py
@@ -1,12 +1,11 @@
 import torch
 import torch.cuda
 
-from torchtext.data import ReversibleField, BucketIterator
-
 import os
 import json
 import tarfile
 import logging
+import re
 from typing import List, Tuple
 
 from .model import gru, lstm, bidir, conv, linear
@@ -172,24 +171,6 @@ def sostoken(self):
     def eostoken(self):
         return self.vocabulary.eos_token_index
 
-    def tag(self, iterator: BucketIterator):
-        self.model.eval()
-        for i, batch in enumerate(iterator):
-            src, src_len = batch.src
-            output, attention = self.model(
-                src, src_len, trg=None,
-                teacher_forcing_ratio=0
-            )  # turn off teacher forcing
-
-            # trg = [trg sent len, batch size]
-            # output = [Maximum Sentence Length, Number of Sentence in batch, Number of possible characters]
-            _, ind = torch.topk(output, 1, dim=2)
-            # ind = [Maximum Sentence Length, Number of Sentences in Batch, One Result]
-
-            # output = output[1:].view(-1, output.shape[-1])
-
-            yield ind.squeeze().permute(1, 0)
-
     @property
     def settings(self):
         return {
@@ -249,13 +230,15 @@ def annotate(self, texts: List[str], batch_size=32):
             translations = self.model.predict(
                 tensor, sentence_length, label_encoder=self.vocabulary
             )
-
-            for index in range(len(batch)):
+            for index in range(len(translations)):
                 yield "".join(translations[order.index(index)])
 
-    def annotate_text(self, string, batch_size=32):
-        strings = [
-            string[n:n+self.out_max_sentence_length-10]
-            for n in range(0, len(string), self.out_max_sentence_length - 10)
-        ]
+    def annotate_text(self, string, splitter=r"(\W+)", batch_size=32):
+        splitter = re.compile(splitter)
+        splits = splitter.split(string)
+
+        tempList = splits + [""] * 2
+        strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)]
+        strings = list(filter(len, strings))
+
         yield from self.annotate(strings, batch_size=batch_size)
diff --git a/linear_run.py b/linear_run.py
index 74bc5f4..e6ae68e 100644
--- a/linear_run.py
+++ b/linear_run.py
@@ -9,7 +9,7 @@
 logger.setLevel(logging.DEBUG)
 
 EPOCHS = 100
-TEST = "seints"
+TEST = "fro"
 RANDOM = True
 DEVICE = "cuda"
 MAXIMUM_LENGTH = 100
diff --git a/test.py b/test.py
index 5eedc72..997eb3a 100644
--- a/test.py
+++ b/test.py
@@ -21,7 +21,7 @@
     train_path, dev_path, test_path = "data/fro/train.tsv", "data/fro/dev.tsv", "data/fro/test.tsv"
 
 
-for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"):
+for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--23:53:32-0.0001.tar"):
     tokenizer = Seq2SeqTokenizer.load(model, device=DEVICE)
     print("Model : " + tokenizer.system.upper() + " from  " + model)
     test_data = tokenizer.vocabulary.get_dataset(test_path, randomized=False)
diff --git a/voc-2.json b/voc-2.json
index 7689ce9..83f15c5 100644
--- a/voc-2.json
+++ b/voc-2.json
@@ -1 +1 @@
-{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
\ No newline at end of file
+{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "1", "4": ",", "6": "6", "7": "r", "8": "2", "9": "s", "10": "u", "11": "o", "12": "h", "13": "k", "14": "3", "15": "a", "16": "z", "17": "c", "18": "m", "19": "\"", "20": "q", "21": ">", "22": "g", "23": "0", "24": "w", "25": "!", "26": ";", "27": "_", "28": "n", "29": " ", "30": "v", "31": "y", "32": ":", "33": "?", "34": "b", "35": "'", "36": "p", "37": "d", "38": "l", "39": "i", "40": ".", "41": "j", "42": "x", "43": "f", "44": "-", "45": "t", "46": "e"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, ",": 4, "1": 5, "6": 6, "r": 7, "2": 8, "s": 9, "u": 10, "o": 11, "h": 12, "k": 13, "3": 14, "a": 15, "z": 16, "c": 17, "m": 18, "\"": 19, "q": 20, ">": 21, "g": 22, "0": 23, "w": 24, "!": 25, ";": 26, "_": 27, "n": 28, " ": 29, "v": 30, "y": 31, ":": 32, "?": 33, "b": 34, "'": 35, "p": 36, "d": 37, "l": 38, "i": 39, ".": 40, "j": 41, "x": 42, "f": 43, "-": 44, "t": 45, "e": 46}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
\ No newline at end of file