-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
More or less working annotate_text. Should split on spaces...
- Loading branch information
1 parent
aa190d2
commit c4435aa
Showing
6 changed files
with
60 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Sample passage used as input for the tokenizer call further down.
# The chained str.replace calls that close this statement delete every "%"
# character, turn each newline into a space, and finally delete every space,
# so the resulting string contains neither spaces nor newlines.
# NOTE(review): the " | ||" at the end of each line looks like diff-table
# residue from a page scrape rather than part of the text - confirm against
# the original file.
input_text = """Segneur ,%% sachies que Mil et .C. .iiijxx. et .xvii. ans apries l'incarnation ihesucrist ,%.%. Au tans Innocent ,%% l'apostole de Rome ,%,%. et Phelippon ,%% roi de france ,%% | ||
et Richart ,%% roi d'engleterre ,%,%. ot vn saint home en france ,%,%. qui ot nom Fouques de Nuelly (%.%. Cil Nuellys siet entre Nuelly sour Marne %,%. et paris )%.%. Et il estoit priessres et tenoit la perroche de la uille .%.%. Et ichil Fouques dont ie vos di | ||
commencha a parle de diu %,%. par france %,%. | ||
et par les autres pais entour ;%.%. | ||
Et sachies que nostre sires fist maintes bieles miracles pour lui .%,%. et tant que la renommee de cel saint home ala ,%% tant qu'ele vint a l'apostole de Rome Innocent ;%.%. Et l'apostoles manda en france au saint home %,%. que il preechast des crois par s'auctorite ;%.%. Et apres i enuoia .i. sien cardonnal ,%% Maistre Pieron de Capes ,%% croisie ,%,%. et manda par lui le pardon tel con ie vous dirai :%.%. | ||
Tout chil qui se croiseroient %,%. et feroient le sieruice diu .i. an en l'ost %,%[punctelev] | ||
seroient quite de toz lor pechies quil auoient fais ,%% dont il seroient confies .%.%. | ||
Pour che que chius pardons fu si grans ,%,%. si s'en esmurent moult li cuer des gens ,%,%. et moult s'en croisierent | ||
pour chou que li pardons estoit si grans .§%.§%. | ||
EN l'autre an apries que chil preudom Fouques parla de diu ,%,%[punctelev] ot .i. tournoi en champaigne ,%% | ||
a .i. castiel qui a non Aicri .%,%. et par la grace de diu si auint ke Thiebaus ,%% quens de champaigne | ||
et de Brie ,%% prist la crois ,%,%. et li cuens Looys de Bloys %,%. et de chartaing .%.%. | ||
Et che fu a l'entree des Auens .%,%. et chil cuens thiebaus estoit iouenes hom et n'auoit pas plus de .xxij. ans ,%.%. Ne li cuens Looys n'auoit pas plus de .xxvij. ans .%,%. Chil doi conte ierent neueu le roi de france %,%. et cousin germain et neueu le roi d'engleterre %.%. De l'autre part .%% auoec ces .ij. contes se croisierent doi moult haut baron de france ,%.%. Symons de Montfort %,%. et Renaus de Mommirail .%.%. Moult fu grans la renommee par les terres .%,%[punctelev] quant cil doi se croisierent .§%.§%. | ||
EN la terre le conte de champaigne se croisa Gerniers li euesques de Troies ,%,%. et li cuens Gautiers de Braine ,%.%. Joffrois de Joinuile ,%,%. | ||
qui estoit senescaus de la tiere ,%.%. | ||
Robiers ses freres ,%.%. Gautiers de voignori ,%.%. Gautiers de Mombelyart ,%.%. | ||
Eustasces d'escouflans ,%.%. Guis dou plaissie %,%. et ses freres ,%% Henris D'ardillieres ,%.%. Ogiers de saint chienon ,%.%.""".replace( | ||
"%", "").replace("\n", " ").replace(" ", "") | ||
|
||
# Show the preprocessed input before it is handed to the tokenizer.
print(input_text)
|
||
from boudams.tagger import Seq2SeqTokenizer
import logging
|
||
# logging.getLogger() with no name returns the root logger; its threshold is
# lowered to DEBUG here.
# NOTE(review): no handler is configured in this script, so whether DEBUG
# records are actually shown depends on handlers installed elsewhere - confirm.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
|
||
# Load a saved model archive from a hard-coded absolute path, passing
# device="cpu".
# NOTE(review): the path is specific to one developer machine; the script
# will not run elsewhere without changing it.
tokenizer = Seq2SeqTokenizer.load("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar", device="cpu")
# The result of annotate_text is concatenated with "" and printed.
# NOTE(review): presumably annotate_text yields string pieces - confirm
# against the boudams API.
print("".join(tokenizer.annotate_text(input_text))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "l", "4": "r", "6": "u", "7": "g", "8": "x", "9": "c", "10": "]", "11": "z", "12": " ", "13": "j", "14": ";", "15": "e", "16": "f", "17": "a", "18": "[", "19": "k", "20": "h", "21": "?", "22": "y", "23": "!", "24": "w", "25": "v", "26": "-", "27": ")", "28": "m", "29": "s", "30": "q", "31": "d", "32": "i", "33": "t", "34": "'", "35": ",", "36": ".", "37": "(", "38": "n", "39": "b", "40": "p", "41": ":", "42": "o"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "r": 4, "l": 5, "u": 6, "g": 7, "x": 8, "c": 9, "]": 10, "z": 11, " ": 12, "j": 13, ";": 14, "e": 15, "f": 16, "a": 17, "[": 18, "k": 19, "h": 20, "?": 21, "y": 22, "!": 23, "w": 24, "v": 25, "-": 26, ")": 27, "m": 28, "s": 29, "q": 30, "d": 31, "i": 32, "t": 33, "'": 34, ",": 35, ".": 36, "(": 37, "n": 38, "b": 39, "p": 40, ":": 41, "o": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}} | ||
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}} |