Skip to content

Commit

Permalink
annotate_text is more or less working; the chunking should split on spaces instead of cutting the text at fixed character offsets.
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed May 24, 2019
1 parent aa190d2 commit c4435aa
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 22 deletions.
33 changes: 33 additions & 0 deletions annotate_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Raw Old French sample (Villehardouin-style chronicle) annotated with '%'-based
# editorial punctuation markers.  The markers and all whitespace are stripped
# below so that the boudams tokenizer can re-segment the unspaced text itself.
_RAW_SAMPLE = """Segneur ,%% sachies que Mil et .C. .iiijxx. et .xvii. ans apries l'incarnation ihesucrist ,%.%. Au tans Innocent ,%% l'apostole de Rome ,%,%. et Phelippon ,%% roi de france ,%%
et Richart ,%% roi d'engleterre ,%,%. ot vn saint home en france ,%,%. qui ot nom Fouques de Nuelly (%.%. Cil Nuellys siet entre Nuelly sour Marne %,%. et paris )%.%. Et il estoit priessres et tenoit la perroche de la uille .%.%. Et ichil Fouques dont ie vos di
commencha a parle de diu %,%. par france %,%.
et par les autres pais entour ;%.%.
Et sachies que nostre sires fist maintes bieles miracles pour lui .%,%. et tant que la renommee de cel saint home ala ,%% tant qu'ele vint a l'apostole de Rome Innocent ;%.%. Et l'apostoles manda en france au saint home %,%. que il preechast des crois par s'auctorite ;%.%. Et apres i enuoia .i. sien cardonnal ,%% Maistre Pieron de Capes ,%% croisie ,%,%. et manda par lui le pardon tel con ie vous dirai :%.%.
Tout chil qui se croiseroient %,%. et feroient le sieruice diu .i. an en l'ost %,%[punctelev]
seroient quite de toz lor pechies quil auoient fais ,%% dont il seroient confies .%.%.
Pour che que chius pardons fu si grans ,%,%. si s'en esmurent moult li cuer des gens ,%,%. et moult s'en croisierent
pour chou que li pardons estoit si grans .§%.§%.
EN l'autre an apries que chil preudom Fouques parla de diu ,%,%[punctelev] ot .i. tournoi en champaigne ,%%
a .i. castiel qui a non Aicri .%,%. et par la grace de diu si auint ke Thiebaus ,%% quens de champaigne
et de Brie ,%% prist la crois ,%,%. et li cuens Looys de Bloys %,%. et de chartaing .%.%.
Et che fu a l'entree des Auens .%,%. et chil cuens thiebaus estoit iouenes hom et n'auoit pas plus de .xxij. ans ,%.%. Ne li cuens Looys n'auoit pas plus de .xxvij. ans .%,%. Chil doi conte ierent neueu le roi de france %,%. et cousin germain et neueu le roi d'engleterre %.%. De l'autre part .%% auoec ces .ij. contes se croisierent doi moult haut baron de france ,%.%. Symons de Montfort %,%. et Renaus de Mommirail .%.%. Moult fu grans la renommee par les terres .%,%[punctelev] quant cil doi se croisierent .§%.§%.
EN la terre le conte de champaigne se croisa Gerniers li euesques de Troies ,%,%. et li cuens Gautiers de Braine ,%.%. Joffrois de Joinuile ,%,%.
qui estoit senescaus de la tiere ,%.%.
Robiers ses freres ,%.%. Gautiers de voignori ,%.%. Gautiers de Mombelyart ,%.%.
Eustasces d'escouflans ,%.%. Guis dou plaissie %,%. et ses freres ,%% Henris D'ardillieres ,%.%. Ogiers de saint chienon ,%.%."""

# Drop the '%' markers, turn newlines into spaces, then drop every space:
# the model's job is precisely to reinsert the word boundaries.
input_text = _RAW_SAMPLE.replace("%", "").replace("\n", " ").replace(" ", "")

print(input_text)

import logging

from boudams.tagger import Seq2SeqTokenizer

# Root logger at DEBUG so the tokenizer's internal logging is fully visible.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# NOTE(review): hard-coded absolute path to a locally trained model artifact —
# presumably a throwaway experiment script; verify before reusing elsewhere.
model_path = "/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"
tokenizer = Seq2SeqTokenizer.load(model_path, device="cpu")

# annotate_text yields chunk-wise segmented strings; join them for display.
segmented = tokenizer.annotate_text(input_text)
print("".join(segmented))
2 changes: 1 addition & 1 deletion boudams/model/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def predict(self, src, src_len, label_encoder: "LabelEncoder") -> torch.Tensor:
"""
out = self(src, src_len, None, teacher_forcing_ratio=0)
logits = torch.argmax(out, 2)
return label_encoder.reverse_batch(logits, masked=src)
return label_encoder.reverse_batch(logits, masked=src, ignore=(self.pad_idx, self.eos_idx, self.sos_idx))

def gradient(
self,
Expand Down
41 changes: 23 additions & 18 deletions boudams/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,26 +231,31 @@ def load(cls, fpath="./model.tar", device=DEVICE):

return obj

def annotate(self, texts: List[str]):
def annotate(self, texts: List[str], batch_size=32):
self.model.eval()
for sentence in texts:

# it would be good at some point to keep and use order to batchify this
tensor, sentence_length, _ = self.vocabulary.pad_and_tensorize(
[self.vocabulary.inp_to_numerical(self.vocabulary.prepare(sentence))[0]],
device=self.device,
padding=self.out_max_sentence_length-len(sentence)
)

from .model.base import pprint_2d
#pprint_2d(tensor.t())
#print(sentence_length)

logging.debug("Input Tensor {}".format(tensor.shape))
logging.debug("Input Positions tensor {}".format(sentence_length.shape))
for n in range(0, len(texts), batch_size):
batch = texts[n:n+batch_size]
xs = [
self.vocabulary.inp_to_numerical(self.vocabulary.prepare(s))
for s in batch
]
logging.info("Dealing with batch %s " % (int(n/batch_size)+1))
tensor, sentence_length, order = self.vocabulary.pad_and_tensorize(
[x for x, _ in xs],
device=self.device,
padding=max(list(map(lambda x: x[1], xs)))
)

translation = self.model.predict(
translations = self.model.predict(
tensor, sentence_length, label_encoder=self.vocabulary
)

yield "".join(translation[0])
for index in range(len(batch)):
yield "".join(translations[order.index(index)])

def annotate_text(self, string, batch_size=32):
strings = [
string[n:n+self.out_max_sentence_length-10]
for n in range(0, len(string), self.out_max_sentence_length - 10)
]
yield from self.annotate(strings, batch_size=batch_size)
2 changes: 1 addition & 1 deletion linear_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

EPOCHS = 10
EPOCHS = 100
TEST = "seints"
RANDOM = True
DEVICE = "cuda"
Expand Down
2 changes: 1 addition & 1 deletion load.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
for line in Examples.split("\n")
]

for model in glob.glob("/home/thibault/dev/boudams/models/lstm2019-05-22--09:23:38-0.0001.tar"):
for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"):
tokenizer = Seq2SeqTokenizer.load(model, device="cpu")
print(tokenizer.model)
treated = tokenizer.annotate([x[0] for x in Examples])
Expand Down
2 changes: 1 addition & 1 deletion voc-2.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "l", "4": "r", "6": "u", "7": "g", "8": "x", "9": "c", "10": "]", "11": "z", "12": " ", "13": "j", "14": ";", "15": "e", "16": "f", "17": "a", "18": "[", "19": "k", "20": "h", "21": "?", "22": "y", "23": "!", "24": "w", "25": "v", "26": "-", "27": ")", "28": "m", "29": "s", "30": "q", "31": "d", "32": "i", "33": "t", "34": "'", "35": ",", "36": ".", "37": "(", "38": "n", "39": "b", "40": "p", "41": ":", "42": "o"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "r": 4, "l": 5, "u": 6, "g": 7, "x": 8, "c": 9, "]": 10, "z": 11, " ": 12, "j": 13, ";": 14, "e": 15, "f": 16, "a": 17, "[": 18, "k": 19, "h": 20, "?": 21, "y": 22, "!": 23, "w": 24, "v": 25, "-": 26, ")": 27, "m": 28, "s": 29, "q": 30, "d": 31, "i": 32, "t": 33, "'": 34, ",": 35, ".": 36, "(": 37, "n": 38, "b": 39, "p": 40, ":": 41, "o": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}

0 comments on commit c4435aa

Please sign in to comment.