diff --git a/CustomVGSL.md b/CustomVGSL.md
index c7e918a..c3fd99c 100644
--- a/CustomVGSL.md
+++ b/CustomVGSL.md
@@ -6,11 +6,13 @@ The new spec system is built around custom architecture strings.
 
 Available modules:
 
-- `C[A]<x>,<d>` uses a convolutional layer where `x` is the n-gram window and `d` the output.
-- `CP[A]<x>,<d>` uses a convolutional layer with positional embeddings where `x` is the n-gram window and `d` the output.
-- `L[A]<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
-- `G[A]<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
-- `D<r>` uses a Dropout layer with a rate of `r`
+- `C<x>,<d>[,<p>]` uses a convolutional layer where `x` is the n-gram window and `d` the output. `p` is an optional padding.
+- `CS[s]<x>,<d>,<l>[,Do<r>]` uses a sequential convolutional layer where `x` is the n-gram window, `d` the output and `l` the number
+of layers. An optional `Dropout` rate can be applied between each convolution. `[s]` (`CSs`) adds a final sum of the convolved output and the original input, with a scale.
+- `P[l]` adds positional embeddings, with an optional linear layer on top (e.g. `Pl`).
+- `L<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
+- `G<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
+- `Do<r>` uses a Dropout layer with a rate of `r`
 - `L<d>` uses a Linear layer of dimension `d`
 
 `[A]` can be replaced with an activation layer, such as:
@@ -21,8 +23,15 @@ Available modules:
 - `l` = linear (i.e., No non-linearity)
 - `m` = softmax
 - `n` = n/a
+- `g` = GLU
 
 The VGSL module must starts with an embedding size: `E`.
 
 Example: `[E200 L120 L200 Cr3,10 D3]` will use a Convolutional Layer of (3 ngram for 10 of dim) and a relu activation over which 30% of dropout is applied before classification
+
+## Legacy architectures
+
+- ConvPos `[E256 Pl Do.3 CS5,256,10,Do.25 L256]`
+- ConvNoPos `[E256 Do.3 CS5,256,10,Do.25 L256]`
+- Gru `[E256 Do.3 CSs5,256,10,Do.25 L256]`
\ No newline at end of file
diff --git a/boudams/cli.py b/boudams/cli.py index b0f1238..48d3d6f 100644 --- a/boudams/cli.py +++ b/boudams/cli.py @@ -180,11 +180,18 @@ def template(filename): @cli.command("train") -@click.argument("config_files", nargs=-1, type=click.File("r")) +@click.argument("train-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("dev-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("test-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("output", type=click.Path(dir_okay=False, exists=False)) +@click.option("--architecture", type=str, help="VGSL-Like architecture.", + default="[E256 Pl Do.3 CSs5,256,10,Do.25 L256]", show_default=True) @click.option("--mode", type=click.Choice(_POSSIBLE_MODES), default="simple-space", show_default=True, help="Type of encoder you want to set-up") -@click.option("--output", type=click.Path(dir_okay=False, exists=False), default=None, help="Model Name") +@click.option("--normalize", type=bool, is_flag=True, default=False, help="Normalize string input with unidecode" " or mufidecode") +@click.option("--lower", type=bool, is_flag=True, default=False, help="Lower strings") @click.option("--epochs", type=int, default=100, help="Number of epochs to run") @click.option("--batch_size", type=int, default=32, help="Size of batches") @click.option("--device", default="cpu", help="Device to use for the network (cuda:0, cpu, etc.)") @@ -194,21 +201,35 @@ def template(filename): @click.option("--metric", default="f1", type=click.Choice(ACCEPTABLE_MONITOR_METRICS), help="Metric to monitor") @click.option("--avg", default="macro", type=click.Choice(["micro", "macro"]), help="Type of avering method to use on " "metrics") +@click.option("--lr", default=.0001, type=float, help="Learning rate", + show_default=True) @click.option("--delta", default=.001, type=float, help="Minimum change in the monitored quantity to qualify as an " - "improvement") -@click.option("--patience", default=3, type=int, help="Number of checks with no improvement after which training " - "will be stopped") + "improvement", + show_default=True) +@click.option("--patience", default=5,
type=int, help="Number of checks with no improvement after which training " + "will be stopped", + show_default=True) +@click.option("--lr-patience", default=3, type=int, help="Number of checks with no improvement for lowering LR", + show_default=True) +@click.option("--shuffle/--no-shuffle", type=bool, is_flag=True, default=True, + help="Suppress the shuffling of datasets", show_default=True) +@click.option("--lr-factor", default=.5, type=float, help="Ratio for lowering LR", show_default=True) @click.option("--seed", default=None, type=int, help="Runs deterministic training") @click.option("--optimizer", default="Adams", type=click.Choice(["Adams"]), help="Optimizer to use") # ToDo: Figure out the bug with Ranger # pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call # `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the # `optimizer.step(optimizer_closure)` call did not execute it internally. -def train(config_files: List[click.File], output: str, mode: str, - epochs: int, batch_size: int, device: str, debug: bool, workers: int, - auto_lr: bool, - metric: str, avg: str, delta: float, patience: int, - seed: int, optimizer: str): +def train( + train_set: str, dev_set: str, test_set: str, + architecture: str, output: str, mode: str, + normalize: bool, lower: bool, + epochs: int, batch_size: int, device: str, debug: bool, workers: int, + auto_lr: bool, + metric: str, avg: str, + lr: float, delta: float, patience: int, + lr_patience: int, lr_factor: float, + seed: int, optimizer: str, shuffle: bool): """ Train one or more models according to [CONFIG_FILES] JSON configurations""" if debug: logger.setLevel(logging.DEBUG) @@ -218,100 +239,99 @@ def train(config_files: List[click.File], output: str, mode: str, if seed: pl.seed_everything(seed, workers=True) + device = device.lower() if device == 'cpu': device = None elif device.startswith('cuda'): device = [int(device.split(':')[-1])] + else: + click.echo(click.style("Device is invalid. 
Either use `cpu` or `cuda:0`, `cuda:1`", fg="red")) + return - for config_file in config_files: - config = json.load(config_file) - - train_path, dev_path, test_path = config["datasets"]["train"],\ - config["datasets"]["dev"],\ - config["datasets"]["test"] - - vocabulary = LabelEncoder( - maximum_length=config.get("max_sentence_size", None), - mode=mode, - remove_diacriticals=config["label_encoder"].get("normalize", True), - lower=config["label_encoder"].get("lower", True) - ) - vocabulary.build(train_path, dev_path, test_path, debug=True) - if debug: - from pprint import pprint - pprint(vocabulary.mtoi) - - # Get the datasets - train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path) - dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path) - test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path) - - logger.info("Training %s " % config_file.name) - logger.info("-- Dataset informations --") - logger.info(f"Number of training examples: {len(train_dataset)}") - logger.info(f"Number of dev examples: {len(dev_dataset)}") - logger.info(f"Number of testing examples: {len(test_dataset)}") - logger.info(f"Vocabulary Size: {len(vocabulary)}") - logger.info("--------------------------") + train_path, dev_path, test_path = train_set, dev_set, test_set - tagger = BoudamsTagger( - vocabulary, - system=config["model"], - out_max_sentence_length=config.get("max_sentence_size", None), - metric_average=avg, - optimizer=OptimizerParams( - optimizer, - kwargs={"lr": config["learner"]["lr"]}, - scheduler={ - "patience": config["learner"].get("lr_patience", None), - "factor": config["learner"].get("lr_factor", None), - "threshold": delta - } - ), - **config["network"] - ) - trainer = Trainer( - gpus=device, - patience=patience, - min_delta=delta, - monitor=metric, - max_epochs=epochs, - gradient_clip_val=1, - model_name=output or (config["name"] + str(datetime.datetime.today()).replace(" ", "--").split(".")[0]), - # n_epochs=epochs, - auto_lr_find=auto_lr, - deterministic=True if seed else False + vocabulary = LabelEncoder( + mode=mode, + remove_diacriticals=normalize, + lower=lower + ) + maximum_sentence_size = vocabulary.build(train_path, dev_path, test_path, debug=True) + if debug: + from pprint import pprint + pprint(vocabulary.mtoi) + + # Get the datasets + train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path) + dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path) + test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path) + + logger.info("Architecture %s " % architecture) + logger.info("-- Dataset informations --") + logger.info(f"Number of training examples: {len(train_dataset)}") + logger.info(f"Number of dev examples: {len(dev_dataset)}") + logger.info(f"Number of testing examples: {len(test_dataset)}") + logger.info(f"Vocabulary Size: {len(vocabulary)}") + logger.info("--------------------------") + + tagger = BoudamsTagger( + vocabulary, + architecture=architecture, + maximum_sentence_size=maximum_sentence_size, + metric_average=avg, + optimizer=OptimizerParams( + optimizer, + kwargs={"lr": lr}, + scheduler={ + "patience": lr_patience, + "factor": lr_factor, + "threshold": delta + } ) - train_dataloader, dev_dataloader = ( - DataLoader( - train_dataset, - batch_size=batch_size, - shuffle=config["datasets"].get("random", True), - collate_fn=train_dataset.train_collate_fn, - num_workers=workers - ), - DataLoader( - dev_dataset, - batch_size=batch_size, - shuffle=config["datasets"].get("random", True), - collate_fn=dev_dataset.train_collate_fn, - 
num_workers=workers - ) + ) + trainer = Trainer( + gpus=device, + patience=patience, + min_delta=delta, + monitor=metric, + max_epochs=epochs, + gradient_clip_val=1, + model_name=output, + # n_epochs=epochs, + auto_lr_find=auto_lr, + deterministic=True if seed else False + ) + train_dataloader, dev_dataloader = ( + DataLoader( + train_dataset, + batch_size=batch_size, + shuffle=shuffle, + collate_fn=train_dataset.train_collate_fn, + num_workers=workers + ), + DataLoader( + dev_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dev_dataset.train_collate_fn, + num_workers=workers ) - if auto_lr: - trainer.tune(tagger, train_dataloader, dev_dataloader) - return - trainer.fit(tagger, train_dataloader, dev_dataloader) + ) - trainer.test( - tagger, - DataLoader( - test_dataset, - batch_size=batch_size, - collate_fn=test_dataset.train_collate_fn, - num_workers=workers - ) + if auto_lr: + trainer.tune(tagger, train_dataloader, dev_dataloader) + return + trainer.fit(tagger, train_dataloader, dev_dataloader) + + trainer.test( + tagger, + DataLoader( + test_dataset, + batch_size=batch_size, + collate_fn=test_dataset.train_collate_fn, + num_workers=workers, + shuffle=False ) + ) @cli.command("test") diff --git a/boudams/encoder.py b/boudams/encoder.py index 24be08b..f4512b4 100644 --- a/boudams/encoder.py +++ b/boudams/encoder.py @@ -76,11 +76,11 @@ def mode(self): def __len__(self): return len(self.stoi) - def build(self, *paths, debug=False): + def build(self, train, *paths, debug=False) -> int: """ Builds vocabulary :param paths: Path of file to read - :return: + :return: Maximum sentence size """ recorded_chars = set() counter = None @@ -88,10 +88,14 @@ def build(self, *paths, debug=False): counter = collections.Counter() logging.info("Reading files for vocabulary building") - for path in paths: + max_sentence_size = 0 + for path_idx, path in enumerate([train, *paths]): with open(path) as fio: for line in fio.readlines(): x, _ = self.readunit(line) + seq_len = len(x) + if seq_len > max_sentence_size: + max_sentence_size = seq_len recorded_chars.update(set(list(x))) logging.info("Saving {} chars to label encoder".format(len(recorded_chars))) @@ -102,6 +106,8 @@ def build(self, *paths, debug=False): # Reuse index for string retrieval self.itos[self.stoi[char]] = char + return max_sentence_size + def readunit(self, line) -> Tuple[str, str]: """ Read a single line diff --git a/boudams/model/__init__.py b/boudams/model/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/boudams/model/bidir.py b/boudams/model/bidir.py deleted file mode 100644 index 9b41501..0000000 --- a/boudams/model/bidir.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch.nn as nn - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.n_layers = n_layers - self.dropout = dropout - - self.embedding = nn.Embedding(input_dim, emb_dim) - - self.rnn = nn.GRU( - emb_dim, hid_dim, n_layers, - dropout=dropout, bidirectional=True, batch_first=True - ) - - self.dropout = nn.Dropout(dropout) - - @property - def output_dim(self): - return self.hid_dim * 2 - - def forward(self, src): - - # src = [src sent len, batch size] - embedded = self.dropout(self.embedding(src)) - - # embedded = [src sent len, batch size, emb dim] - - # packed_outputs = [src sent len, batch size, hid dim * n directions] - # hidden = [n layers * n directions, batch size, hid dim] - output, 
hidden = self.rnn(embedded) - - return output, hidden - - def init_weights(self): - for name, param in self.named_parameters(): - nn.init.uniform_(param.data, -0.08, 0.08) - diff --git a/boudams/model/conv.py b/boudams/model/conv.py deleted file mode 100644 index fdb0f06..0000000 --- a/boudams/model/conv.py +++ /dev/null @@ -1,94 +0,0 @@ -# https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, - max_sentence_len: int = 100): - super().__init__() - - assert kernel_size % 2 == 1, "Kernel size must be odd!" - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.kernel_size = kernel_size - self.dropout = dropout - self.max_sentence_len = max_sentence_len - - self.scale = torch.sqrt(torch.FloatTensor([0.5])) - - self.tok_embedding = nn.Embedding(input_dim, emb_dim) - self.pos_embedding = nn.Embedding(max_sentence_len, emb_dim) - - self.emb2hid = nn.Linear(emb_dim, hid_dim) - self.hid2emb = nn.Linear(hid_dim, emb_dim) - - self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, - out_channels=2 * hid_dim, - kernel_size=kernel_size, - padding=(kernel_size - 1) // 2) - for _ in range(n_layers)]) - - self.dropout = nn.Dropout(dropout) - - def forward(self, src): - # create position tensor - - # pos = [src sent len, batch size] (Not what is documented) - pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).type_as(src) - - # embed tokens and positions - tok_embedded = self.tok_embedding(src) - pos_embedded = self.pos_embedding(pos) - - # tok_embedded = pos_embedded = [batch size, src sent len, emb dim] - - # combine embeddings by elementwise summing - embedded = self.dropout(tok_embedded + pos_embedded) - - # embedded = [batch size, src sent len, emb dim] - - # pass embedded through linear layer to go through emb dim -> hid dim - conv_input = self.emb2hid(embedded) - - # conv_input = [batch size, src sent len, hid dim] - - - # permute for convolutional layer - conv_input = conv_input.permute(0, 2, 1) - - # conv_input = [batch size, hid dim, src sent len] - self.scale = self.scale.type_as(conv_input) - for i, conv in enumerate(self.convs): - # pass through convolutional layer - conved = conv(self.dropout(conv_input)) - - # conved = [batch size, 2*hid dim, src sent len] - - # pass through GLU activation function - conved = F.glu(conved, dim=1) - - # conved = [batch size, hid dim, src sent len] - - # apply residual connection - conved = (conved + conv_input) * self.scale - - # conved = [batch size, hid dim, src sent len] - - # set conv_input to conved for next lo`op iteration - conv_input = conved - - # permute and convert back to emb dim - conved = self.hid2emb(conved.permute(0, 2, 1)) - - # conved = [batch size, src sent len, emb dim] - - # elementwise sum output (conved) and input (embedded) to be used for attention - combined = (conved + embedded) * self.scale - - # combined = [batch size, src sent len, emb dim] - return conved, combined diff --git a/boudams/model/linear.py b/boudams/model/linear.py deleted file mode 100644 index 9e5e5ab..0000000 --- a/boudams/model/linear.py +++ /dev/null @@ -1,206 +0,0 @@ -# 
https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -from typing import Optional, List - - -from .conv import Encoder as CNNEncoder -from .lstm import Encoder as LSTMEncoder -from .bidir import Encoder as BiGruEncoder - - -class LinearEncoderCNN(CNNEncoder): - def forward(self, src, keep_pos=False): - o, p = super(LinearEncoderCNN, self).forward(src) - if keep_pos: - return p - return o - - -class LinearLSTMEncoder(LSTMEncoder): - """ Linear - version of the LSTMEncoder """ - - -class LinearEncoderCNNNoPos(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout): - super().__init__() - - assert kernel_size % 2 == 1, "Kernel size must be odd!" - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.kernel_size = kernel_size - self.dropout = dropout - - self.scale = torch.sqrt(torch.FloatTensor([0.5])) - - self.tok_embedding = nn.Embedding(input_dim, emb_dim) - - self.emb2hid = nn.Linear(emb_dim, hid_dim) - self.hid2emb = nn.Linear(hid_dim, emb_dim) - - self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, - out_channels=2 * hid_dim, - kernel_size=kernel_size, - padding=(kernel_size - 1) // 2) - for _ in range(n_layers)]) - - self.dropout = nn.Dropout(dropout) - - def forward(self, src): - # create position tensor - - # pos = [src sent len, batch size] (Not what is documented) - pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1) - - # embed tokens and positions - tok_embedded = self.tok_embedding(src) - - # tok_embedded = pos_embedded = [batch size, src sent len, emb dim] - - # combine embeddings by elementwise summing - embedded = self.dropout(tok_embedded) - - # embedded = [batch size, src sent len, emb dim] - - # pass embedded through linear layer to go through emb dim -> hid dim - conv_input = self.emb2hid(embedded) - - # conv_input = [batch size, src sent len, hid dim] - - - # permute for convolutional layer - conv_input = conv_input.permute(0, 2, 1) - - # conv_input = [batch size, hid dim, src sent len] - - self.scale = self.scale.type_as(conv_input) - - for i, conv in enumerate(self.convs): - # pass through convolutional layer - conved = conv(self.dropout(conv_input)) - - # conved = [batch size, 2*hid dim, src sent len] - - # pass through GLU activation function - conved = F.glu(conved, dim=1) - - # conved = [batch size, hid dim, src sent len] - - # apply residual connection - conved = (conved + conv_input) * self.scale - - # conved = [batch size, hid dim, src sent len] - - # set conv_input to conved for next lo`op iteration - conv_input = conved - - # permute and convert back to emb dim - conved = self.hid2emb(conved.permute(0, 2, 1)) - - # conved = [batch size, src sent len, emb dim] - - # combined = [batch size, src sent len, emb dim] - return conved - - -class LinearDecoder(nn.Module): - """ - Simple Linear Decoder that outputs a probability distribution - over the vocabulary - Parameters - =========== - label_encoder : LabelEncoder - in_features : int, input dimension - """ - def __init__(self, enc_dim, out_dim, highway_layers=0, highway_act='relu'): - super().__init__() - self.out_dim = out_dim - # highway - self.highway = None - # decoder output - self.decoder = nn.Linear(enc_dim, out_dim) - - self.relu = True - - def forward(self, enc_outs): - if self.highway is not None: - enc_outs = 
self.highway(enc_outs) - - return self.decoder(enc_outs) - - -class MainModule(nn.Module): - masked_only = True - - def __init__( - self, - encoder: CNNEncoder, decoder: LinearDecoder, - pad_idx: int, - pos: bool = False, - **kwargs - ): - super().__init__() - - self.encoder = encoder - self.decoder: LinearDecoder = decoder - self.pos = pos - - self.pad_idx = pad_idx - - # nll weight - nll_weight = torch.ones(decoder.out_dim) - nll_weight[pad_idx] = 0. - self.register_buffer('nll_weight', nll_weight) - - def forward(self, src, src_len, trg=None, **kwargs): - # src = [batch size, src sent len] - # trg = [batch size, trg sent len] - - # calculate z^u (encoder_conved) and e (encoder_combined) - # encoder_conved is output from final encoder conv. block - # encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings - if isinstance(self.encoder, LinearEncoderCNN): - second_step = self.encoder(src, keep_pos=True) - elif isinstance(self.encoder, LinearEncoderCNNNoPos): - second_step = self.encoder(src) - elif isinstance(self.encoder, LinearLSTMEncoder): - second_step, hidden, cell = self.encoder(src.t()) - # -> tensor(sentence size, batch size, hid dim * n directions) - elif isinstance(self.encoder, BiGruEncoder): - second_step, hidden = self.encoder(src) - # -> tensor(sentence size, batch size, hid dim * n directions) - # second_step = second_step.transpose(1, 0) - # -> tensor(batch size, sentence size, hid dim * n directions) - else: - raise AttributeError("The encoder is not recognized.") - - output = self.decoder(second_step) - return output - - def predict( - self, - src, - src_len, - label_encoder: "LabelEncoder", - override_src: Optional[List[str]] = None - ) -> torch.Tensor: - """ Predicts value for a given tensor - - :param src: tensor(batch size x sentence_length) - :param src_len: tensor(batch size) - :param label_encoder: Encoder - :return: Reversed Batch - """ - out = self(src, src_len, None, teacher_forcing_ratio=0) - logits = torch.argmax(out, -1) - return label_encoder.reverse_batch( - input_batch=src, - mask_batch=logits - ) diff --git a/boudams/model/lstm.py b/boudams/model/lstm.py deleted file mode 100644 index 5dc45eb..0000000 --- a/boudams/model/lstm.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch.nn as nn - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.n_layers = n_layers - self.dropout = dropout - - self.embedding = nn.Embedding(input_dim, emb_dim) - - self.rnn = nn.LSTM( - emb_dim, hid_dim, n_layers, - dropout=dropout, bidirectional=True, batch_first=True - ) - - self.dropout = nn.Dropout(dropout) - - @property - def output_dim(self): - return 2 * self.hid_dim - - def forward(self, src): - - # src = [src sent len, batch size] - # ToDo: Check PackPadded given the results ? 
- embedded = self.dropout(self.embedding(src)) - - # embedded = [src sent len, batch size, emb dim] - - # packed_outputs = [src sent len, batch size, hid dim * n directions] - # hidden = [n layers * n directions, batch size, hid dim] - # cell = [n layers * n directions, batch size, hid dim] - output, (hidden, cell) = self.rnn(embedded) - - return output, hidden, cell - - def init_weights(self): - for name, param in self.named_parameters(): - nn.init.uniform_(param.data, -0.08, 0.08) - diff --git a/boudams/modes.py b/boudams/modes.py index 4444206..d5f3388 100644 --- a/boudams/modes.py +++ b/boudams/modes.py @@ -30,10 +30,9 @@ def __init__(self, masks: Dict[str, int] = None): DEFAULT_MASK_TOKEN: 1, DEFAULT_WB_TOKEN: 2 } - self.index_to_mask: Dict[str, int] = masks or { - 0: DEFAULT_PAD_TOKEN, - 1: DEFAULT_MASK_TOKEN, - 2: DEFAULT_WB_TOKEN + self.index_to_mask: Dict[str, int] = { + value: key + for value, key in self.masks_to_index.items() } self.index_to_masks_name: Dict[int, str] = { 0: "PAD", @@ -80,12 +79,12 @@ def generate_mask( :return: >>> (SimpleSpaceMode()).generate_mask("j'ai un cheval") - ('xxx|x|xxxxx|', "j'aiuncheval") + ("j'aiuncheval", '---|-|-----|') """ - split = self._space.split(string) + split = self._space.split(string.strip()) masks = DEFAULT_WB_TOKEN.join([DEFAULT_MASK_TOKEN * (len(tok)-1) for tok in split]) + DEFAULT_WB_TOKEN model_input = "".join(split) - assert len(masks) == len(model_input), f"Length of input and mask should be equal `{masks}` + `{model_input}`" + assert len(masks) == len(model_input), f"Length of input and mask should be equal `{masks}` + `{model_input}` + `{string}`" return model_input, masks def encode_mask(self, masked_string: Sequence[str]) -> List[int]: diff --git a/boudams/modules/__init__.py b/boudams/modules/__init__.py new file mode 100644 index 0000000..f52418a --- /dev/null +++ b/boudams/modules/__init__.py @@ -0,0 +1,311 @@ +from typing import Optional, Callable, Tuple +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.utils.rnn as rnn_utils + + +__all__ = [ + "ModelWrapper", + "PosEmbedding", + "Conv", "SequentialConv", + "Dropout", + "BiLSTM", "BiGru", + "Linear" +] + + +class ModelWrapper(nn.Module): + def __init__(self, input_dim: int, use_positional: bool = False): + super(ModelWrapper, self).__init__() + self._input_dim: int = input_dim + self._output_dim: int = 0 + self._nn: nn.Module = nn.Module() + self._use_positional: bool = use_positional + + @property + def input_dim(self): + return self._input_dim + + @property + def output_dim(self) -> int: + return self._output_dim + + def _forward( + self, + inp: torch.Tensor, + inp_length: Optional[torch.Tensor] = None + ) -> torch.Tensor: + raise NotImplementedError() + + def forward(self, + inp: torch.Tensor, + inp_length: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + return self._forward(inp, inp_length), inp_length + + def init_weights(self): + return + + +class Dropout(ModelWrapper): + def __init__( + self, + input_dim, + rate: float + ): + super(Dropout, self).__init__(input_dim=input_dim) + self._rate = rate + self._output_dim = input_dim + self._nn = nn.Dropout(self._rate) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + return self._nn(inp) + + +class Linear(ModelWrapper): + def __init__( + self, + input_dim, + output_dim: int + ): + super(Linear, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.Linear(in_features=input_dim, 
out_features=output_dim) + # Directions*Hidden_Dim + self._output_dim = output_dim + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + return self._nn(inp) + + +class Conv(ModelWrapper): + """ + LICENSE: https://github.com/allenai/allennlp/blob/master/LICENSE + """ + def __init__( + self, + input_dim, + out_filters: int, + filter_size: int, + padding_size: Optional[int] = 0, + activation: Optional[str] = "g" + ): + super(Conv, self).__init__(input_dim=input_dim, use_positional=False) + self._padding_size = padding_size + self._nn = nn.Conv1d( + in_channels=input_dim, + out_channels=out_filters if activation != "g" else 2*out_filters, + kernel_size=(filter_size, ), + padding=padding_size + ) + self._output_dim = out_filters + + if activation == "g": + self.activation: Callable[[torch.Tensor], torch.Tensor] = lambda conv_output: F.glu(conv_output, dim=1) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # Input is (batch_size, sequence_length, dimension) + # The convolution layers expect input of shape `(batch_size, in_channels, sequence_length) + # We permute last two dimensions + + # conv_input = [batch size, encoding dim, src sent len] + conv_input = inp.permute(0, 2, 1) + + # If we have glu, out_dim is 2*out_filters then out_filters at activation time + # Otherwise, always out_filters + # conved = [batch size, 2*out_filters, src sent len] + conved = self._nn(conv_input) + return self.activation(conved).permute(0, 2, 1) + + +class PosEmbedding(ModelWrapper): + """ + LICENSE: https://github.com/allenai/allennlp/blob/master/LICENSE + """ + def __init__( + self, + input_dim, + maximum_sentence_size: int, + padding_size: Optional[int] = 0, + activation: Optional[str] = None + ): + super(PosEmbedding, self).__init__(input_dim=input_dim, use_positional=False) + self._padding_size = padding_size + self._output_dim = self._input_dim + + self._nn: nn.Embedding = nn.Embedding(maximum_sentence_size, input_dim) + self.activation: Optional[nn.Linear] = None + if activation: + self.activation = nn.Linear(input_dim, input_dim) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # Input is (batch_size, sequence_length, dimension) + # The convolution layers expect input of shape `(batch_size, in_channels, sequence_length) + # We permute last two dimensions + + pos = torch.arange(0, inp.shape[1], device=inp.device).unsqueeze(0).repeat(inp.shape[0], 1) + + inp = inp + self._nn(pos) + if self.activation is not None: + return self.activation(inp) + return inp + + +class BiLSTM(ModelWrapper): + def __init__( + self, + input_dim, + hidden_dim: int, + padding_idx = 0, + layers: int = 1 + ): + super(BiLSTM, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.LSTM( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=layers, + bidirectional=True, + batch_first=True + ) + # Directions*Hidden_Dim + self._output_dim = 2*hidden_dim + self._padding_idx: int = padding_idx + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + + # inp = [src sent len, batch size, emb dim] + # packed_outputs = [src sent len, batch size, hid dim * n directions] + inp = rnn_utils.pack_padded_sequence(inp, lengths=inp_length.cpu(), batch_first=True) + output, _ = self._nn(inp) + return rnn_utils.pad_packed_sequence( + output, + padding_value=self._padding_idx, + batch_first=True + )[0] + + 
def init_weights(self): + for name, param in self.named_parameters(): + nn.init.uniform_(param.data, -0.08, 0.08) + + +class BiGru(ModelWrapper): + def __init__( + self, + input_dim, + hidden_dim: int, + layers: int = 1, + padding_idx = 0 + ): + super(BiGru, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.GRU( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=layers, + bidirectional=True, + batch_first=True + ) + # Directions*Hidden_Dim + self._output_dim = 2 * hidden_dim + self._padding_idx: int = padding_idx + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + + # inp = [src sent len, batch size, emb dim] + # packed_outputs = [src sent len, batch size, hid dim * n directions] + + inp = rnn_utils.pack_padded_sequence(inp, lengths=inp_length.cpu(), batch_first=True) + output, _ = self._nn(inp) + return rnn_utils.pad_packed_sequence( + output, + padding_value=self._padding_idx, + batch_first=True + )[0] + + def init_weights(self): + for name, param in self.named_parameters(): + nn.init.uniform_(param.data, -0.08, 0.08) + + +class SequentialConv(ModelWrapper): + # https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb + def __init__( + self, + input_dim, + filter_dim, + n_layers, + filter_size, + use_sum: Optional[str] = None, + dropout: Optional[float] = None + ): + super().__init__(input_dim=input_dim) + + assert filter_size % 2 == 1, "Filter size must be odd!" + + self._output_dim = input_dim + self._filter_size = filter_size + + self._scale = torch.sqrt(torch.FloatTensor([0.5])) + self._inp_to_filter = nn.Linear(input_dim, filter_dim) + self._filter_to_inp = nn.Linear(filter_dim, input_dim) + + self._nns = nn.ModuleList([ + nn.Conv1d(in_channels=filter_dim, out_channels=2 * filter_dim, + kernel_size=filter_size, padding=(filter_size - 1) // 2) + for _ in range(n_layers) + ]) + + self._dropout: Optional[nn.Dropout] = None + if dropout: + self._dropout = nn.Dropout(dropout) + + self._use_sum: Optional[str] = use_sum + + def dropout(self, x): + if self._dropout is not None: + return self._dropout(x) + return x + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # pass embedded through linear layer to go through emb dim -> hid dim + # conv_input = [batch size, src sent len, hid dim] + conv_input = self._inp_to_filter(inp) + + # permute for convolutional layer + conv_input = conv_input.permute(0, 2, 1) + + # conv_input = [batch size, hid dim, src sent len] + self._scale = self._scale.type_as(conv_input) + + conved: Optional[torch.Tensor] = None + + for i, conv in enumerate(self._nns): + # pass through convolutional layer + conved = conv(self.dropout(conv_input)) + + # conved = [batch size, 2*hid dim, src sent len] + + # pass through GLU activation function + conved = F.glu(conved, dim=1) + + # conved = [batch size, hid dim, src sent len] + + # apply residual connection + conved = (conved + conv_input) * self._scale + + # conved = [batch size, hid dim, src sent len] + + # set conv_input to conved for next lo`op iteration + conv_input = conved + + # permute and convert back to emb dim + conved = self._filter_to_inp(conved.permute(0, 2, 1)) + + if self._use_sum: + # conved = [batch size, src sent len, emb dim] + + # elementwise sum output (conved) and input (embedded) to be used for attention + combined = (conved + inp) * self._scale + + # 
combined = [batch size, src sent len, emb dim] + return combined + return conved diff --git a/boudams/tagger.py b/boudams/tagger.py index 663d621..84be004 100644 --- a/boudams/tagger.py +++ b/boudams/tagger.py @@ -7,13 +7,9 @@ import logging import regex as re from dataclasses import dataclass +from collections import OrderedDict from typing import List, Any, Optional, Dict, ClassVar, Tuple -from boudams.model import linear -from boudams import utils - -from .encoder import LabelEncoder - import pytorch_lightning as pl import torch.nn as nn import torch.optim as optim @@ -21,6 +17,9 @@ import torchmetrics from boudams.utils import improvement_on_min_or_max +from boudams.modules import * +from boudams import utils +from boudams.encoder import LabelEncoder teacher_forcing_ratio = 0.5 @@ -28,6 +27,31 @@ MAX_LENGTH = 150 +_re_embedding = re.compile(r"E(\d+)") +_re_conv = re.compile(r"C(\d+),(\d+)(?:,(\d+))?") +_re_sequential_conv = re.compile(r"CS(s)?(\d+),(\d+),(\d+)(?:,Do(0?\.\d+))") +_re_pos = re.compile(r"P(l)?") +_re_bilstm = re.compile(r"L(\d+),(\d+)") +_re_bigru = re.compile(r"G(\d+),(\d+)") +_re_linear = re.compile(r"L(\d+)") +_re_dropout = re.compile(r"Do(0?\.\d+)") + + +def _map_params(iterable): + def weird_float(x): + if x.startswith("."): + return x[1:].isnumeric() + return x.isnumeric() + + def eval_weird(x): + if x.startswith("."): + return float(f"0{x}") + else: + return eval(x) + + return (eval_weird(x) if x and weird_float(x) else x for x in iterable) + + class CrossEntropyLoss(pl.LightningModule): def __init__(self, pad_index, weights=None): super(CrossEntropyLoss, self).__init__() @@ -97,31 +121,121 @@ def get_optimizer( return optimizer, scheduler +class ArchitectureStringError(ValueError): + """ Error raised with wrong architecture string""" + + +def parse_architecture( + string: str, + maximum_sentence_size: Optional[int] = None +) -> Tuple[int, nn.ModuleDict, int]: + """ Returns an embedding dimension and a module list + + >>> BoudamsTagger.parse_architecture("[E200 C5,10]") + + Should result in `(200, nn.ModuleList([Conv(200, out_filters=10, filter_size=5)]))` + + >>> BoudamsTagger.parse_architecture("[E200 C5,10,2]") + Should result in `(200, nn.ModuleList([Conv(200, out_filters=10, filter_size=5, padding_size=2)]))` + """ + if string[0] != "[" and string[-1] != "]": + raise ArchitectureStringError("Architectures need top be encapsulated in [ ]") + string = string[1:-1] + modules: List[ModelWrapper] = [] + names: List[str] = [] + emb_dim = 0 + last_dim: int = 0 + for idx, module in enumerate(string.split()): + if idx == 0: + if _re_embedding.match(module): + emb_dim = eval(_re_embedding.match(module).group(1)) + last_dim = emb_dim + else: + raise ArchitectureStringError("First module needs to be an embedding module. 
Start with [E...]" + " where is a dimension, such as [E200") + elif _re_embedding.match(module): + raise ArchitectureStringError("You can't have embeddings after the first module") + elif _re_conv.match(module): + ngram, filter_dim, padding = _map_params(_re_conv.match(module).groups()) + modules.append(Conv( + input_dim=last_dim, + out_filters=filter_dim, + filter_size=ngram, + **(dict(padding_size=padding) if padding else {}) + )) + elif _re_sequential_conv.match(module): + use_sum, ngram, filter_dim, layers, drop = _map_params(_re_sequential_conv.match(module).groups()) + modules.append(SequentialConv( + input_dim=last_dim, + filter_dim=filter_dim, + filter_size=ngram, + n_layers=layers, + dropout=drop, + use_sum=use_sum or "" + )) + elif _re_pos.match(module): + activation, = _map_params(_re_pos.match(module).groups()) + modules.append(PosEmbedding( + input_dim=last_dim, + maximum_sentence_size=maximum_sentence_size, + activation=activation + )) + elif _re_bilstm.match(module): + hidden_dim, layers = _map_params(_re_bilstm.match(module).groups()) + modules.append(BiLSTM( + input_dim=last_dim, + hidden_dim=hidden_dim, + layers=layers + )) + elif _re_bigru.match(module): + hidden_dim, layers = _map_params(_re_bigru.match(module).groups()) + modules.append(BiGru( + input_dim=last_dim, + hidden_dim=hidden_dim, + layers=layers + )) + elif _re_linear.match(module): + dim, = _map_params(_re_linear.match(module).groups()) + modules.append(Linear( + input_dim=last_dim, + output_dim=dim + )) + elif _re_dropout.match(module): + rate, = _map_params(_re_dropout.match(module).groups()) + + modules.append(Dropout( + input_dim=last_dim, + rate=rate + )) + else: + raise ArchitectureStringError(f"Unknown `{module}` architecture") + + if len(modules): + last_dim = modules[-1].output_dim + names.append(module.replace(".", "_")) + + return emb_dim, nn.ModuleDict(OrderedDict(zip(names, modules))), last_dim + + class BoudamsTagger(pl.LightningModule): + def __init__( self, vocabulary: LabelEncoder, + architecture: str = "[E256 G256,2 D.3]", metric_average: str = "macro", - hidden_size: int = 256, - enc_n_layers: int = 10, - emb_enc_dim: int = 256, - enc_hid_dim: int = None, - enc_dropout: float = 0.5, - enc_kernel_size: int = 3, - out_max_sentence_length: int = 150, + maximum_sentence_size: int = 150, optimizer: Optional[OptimizerParams] = None, - system: str = "bi-gru", - have_metrics: bool = False, - **kwargs # RetroCompat + have_metrics: bool = False ): """ :param vocabulary: - :param hidden_size: - :param n_layers: - :param emb_enc_dim: - :param emb_dec_dim: - :param max_length: + :param architecture: + :param metric_average: + :param maximum_sentence_size: + :param optimizer: + :param have_metrics: """ super(BoudamsTagger, self).__init__() @@ -133,31 +247,31 @@ def __init__( self.optimizer_params.validate() self.lr = self.optimizer_params.kwargs.get("lr") - # Parse params and sizes - self.enc_hid_dim = self.dec_hid_dim = self.hidden_size = hidden_size - - if enc_hid_dim: - self.enc_hid_dim: int = enc_hid_dim - - self.emb_enc_dim: int = emb_enc_dim - self.enc_dropout: float = enc_dropout - self.enc_kernel_size: int = enc_kernel_size - self.enc_n_layers: int = enc_n_layers - - self.out_max_sentence_length: int = out_max_sentence_length - self.system: str = system + self._nb_classes = len(self.vocabulary.itom) + # Parse params and sizes + self._architecture: str = architecture + _emb_dims, sequence, last_dim = parse_architecture( + architecture, + maximum_sentence_size=maximum_sentence_size + ) + 
self._maximum_sentence_size: Optional[int] = maximum_sentence_size + self._emb_dims = _emb_dims + self._module_dict: nn.ModuleDict = sequence # Based on self.masked, decoder dimension can be drastically different - self.dec_dim = len(self.vocabulary.itom) - - # Build the module - self._build_nn() + self._embedder = nn.Embedding(self.vocabulary_dimension, self._emb_dims) + self._classifier = nn.Linear(last_dim, self._nb_classes) if self.optimizer_params: # ToDo: Allow for DiceLoss - self.train_loss = CrossEntropyLoss(weights=self.model.nll_weight, + # Needed when loading dict + nll_weight = torch.ones(self._nb_classes) + nll_weight[vocabulary.pad_token_index] = 0. + self.register_buffer('nll_weight', nll_weight) + + self.train_loss = CrossEntropyLoss(weights=self.nll_weight, pad_index=self.vocabulary.pad_token_index) - self.val_loss = CrossEntropyLoss(weights=self.model.nll_weight, + self.val_loss = CrossEntropyLoss(weights=self.nll_weight, pad_index=self.vocabulary.pad_token_index) if metric_average not in {"micro", "macro"}: @@ -184,55 +298,6 @@ def add_metrics(self, prefix, metric_average): multiclass=True )) - def _build_nn(self): - seq2seq_shared_params = { - "pad_idx": self.padtoken, - "out_max_sentence_length": self.out_max_sentence_length - } - - if self.system.endswith("-lstm"): - self.enc: linear.LSTMEncoder = linear.LinearLSTMEncoder( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout - ) - in_features = self.enc.output_dim - elif self.system.endswith("-gru"): - self.enc: linear.BiGruEncoder = linear.BiGruEncoder( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout - ) - in_features = self.enc.output_dim - elif self.system.endswith("-conv-no-pos"): - self.enc: linear.LinearEncoderCNNNoPos = linear.LinearEncoderCNNNoPos( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout, - kernel_size=self.enc_kernel_size - ) - in_features = self.emb_enc_dim - # This model does not need sentence length - self.out_max_sentence_length = None - else: - self.enc: linear.CNNEncoder = linear.LinearEncoderCNN( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout, - kernel_size=self.enc_kernel_size, - max_sentence_len=self.out_max_sentence_length - ) - in_features = self.emb_enc_dim - - self.dec: linear.LinearDecoder = linear.LinearDecoder( - enc_dim=in_features, out_dim=self.vocabulary.mode.classes_count - ) - self.model: linear.MainModule = linear.MainModule( - self.enc, self.dec, - pos="nopos" not in self.system, - **seq2seq_shared_params - ) - @property def padtoken(self): return self.vocabulary.pad_token_index @@ -240,41 +305,15 @@ def padtoken(self): @property def settings(self): return { - "enc_kernel_size": self.enc_kernel_size, - "enc_n_layers": self.enc_n_layers, - "hidden_size": self.hidden_size, - "enc_hid_dim": self.enc_hid_dim, - "emb_enc_dim": self.emb_enc_dim, - "enc_dropout": self.enc_dropout, - "out_max_sentence_length": self.out_max_sentence_length, - "system": self.system + "architecture": self._architecture, + "maximum_sentence_size": self._maximum_sentence_size } - @classmethod - def load(cls, fpath="./model.boudams_model"): - with tarfile.open(utils.ensure_ext(fpath, 'boudams_model'), 'r') as tar: - settings = 
json.loads(utils.get_gzip_from_tar(tar, 'settings.json.zip')) - - # load state_dict - #print(json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json"))) - vocab = LabelEncoder.load( - json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json")) - ) - - obj = cls(vocabulary=vocab, **settings) - - # load state_dict - with utils.tmpfile() as tmppath: - tar.extract('state_dict.pt', path=tmppath) - dictpath = os.path.join(tmppath, 'state_dict.pt') - obj.model.load_state_dict(torch.load(dictpath)) - - obj.model.eval() - - return obj - - def forward(self, x: torch.TensorType, x_len: Optional[torch.TensorType] = None) -> Any: - return self.model.forward(x, x_len) + def forward(self, x: torch.TensorType, x_len: Optional[torch.TensorType] = None) -> torch.Tensor: + after_seq_out = self._embedder(x) + for module in self._module_dict.values(): + after_seq_out, x_len = module(after_seq_out, x_len) + return self._classifier(after_seq_out) def training_step(self, batch, batch_idx): # -> pl.utilities.types.STEP_OUTPUT: """ Runs training step on a batch @@ -361,7 +400,7 @@ def _eval_step(self, batch, batch_idx, prefix: str): return x, y, gt, {"confusion_matrix": matrix} def _view_y_gt(self, y, gt): - return y.view(-1, self.model.decoder.out_dim), gt.view(-1) + return y.view(-1, self._nb_classes), gt.view(-1) def configure_optimizers(self): optimizer, scheduler = self.optimizer_params.get_optimizer( @@ -381,11 +420,15 @@ def configure_optimizers(self): ) def annotate(self, texts: List[str], batch_size=32, device: str = "cpu"): - self.model.eval() + self.eval() for n in range(0, len(texts), batch_size): batch = texts[n:n+batch_size] xs = [ - self.vocabulary.sent_to_numerical(self.vocabulary.prepare(s)) + self.vocabulary.sent_to_numerical( + self.vocabulary.mode.prepare_input( + self.vocabulary.prepare(s) + ) + ) for s in batch ] logging.info("Dealing with batch %s " % (int(n/batch_size)+1)) @@ -397,8 +440,8 @@ def annotate(self, texts: List[str], batch_size=32, device: str = "cpu"): if device != "cpu": tensor, sentence_length = tensor.to(device), sentence_length.to(device) - translations = self.model.predict( - tensor, sentence_length, label_encoder=self.vocabulary, + translations = self._string_predict( + tensor, sentence_length, override_src=[batch[order_id] for order_id in order] ) for index in range(len(translations)): @@ -412,9 +455,9 @@ def annotate_text(self, string, splitter=r"([⁊\W\d]+)", batch_size=32, device: strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)] strings = list(filter(lambda x: x.strip(), strings)) - if self.out_max_sentence_length: + if self._maximum_sentence_size: treated = [] - max_size = self.out_max_sentence_length - 5 + max_size = self._maximum_sentence_size for string in strings: if len(string) > max_size: treated.extend([ @@ -426,6 +469,28 @@ def annotate_text(self, string, splitter=r"([⁊\W\d]+)", batch_size=32, device: strings = treated yield from self.annotate(strings, batch_size=batch_size, device=device) + @classmethod + def load(cls, fpath="./model.boudams_model"): + with tarfile.open(utils.ensure_ext(fpath, 'boudams_model'), 'r') as tar: + settings = json.loads(utils.get_gzip_from_tar(tar, 'settings.json.zip')) + + vocab = LabelEncoder.load( + json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json")) + ) + + obj = cls(vocabulary=vocab, **settings) + + # load state_dict + with utils.tmpfile() as tmppath: + tar.extract('state_dict.pt', path=tmppath) + dictpath = os.path.join(tmppath, 'state_dict.pt') + # Strict false for predict (nll_weight is 
removed) + obj.load_state_dict(torch.load(dictpath), strict=False) + + obj.eval() + + return obj + def dump(self, fpath="model"): fpath += ".boudams_model" fpath = utils.ensure_ext(fpath, 'boudams_model', infix=None) @@ -451,7 +516,26 @@ def dump(self, fpath="model"): # serialize field with utils.tmpfile() as tmppath: - torch.save(self.model.state_dict(), tmppath) + torch.save(self.state_dict(), tmppath) tar.add(tmppath, arcname='state_dict.pt') return fpath + + def _string_predict( + self, + src, + src_len, + override_src: Optional[List[str]] = None + ) -> torch.Tensor: + """ Predicts value for a given tensor + :param src: tensor(batch size x sentence_length) + :param src_len: tensor(batch size) + :param label_encoder: Encoder + :return: Reversed Batch + """ + out = self(src, src_len) + logits = torch.argmax(out, -1) + return self.vocabulary.reverse_batch( + input_batch=src, + mask_batch=logits + )
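
For illustration, a minimal sketch of how an architecture string from the new spec is parsed and applied, based on `parse_architecture()` and `BoudamsTagger.forward()` introduced in this patch. The `boudams.tagger` import path follows the patch; the vocabulary size, class count and tensor shapes are arbitrary placeholder values, and the snippet is an untested sketch rather than part of the change.

```python
# Minimal sketch (untested): parse a VGSL-like string and run the resulting module
# chain, mirroring parse_architecture() and BoudamsTagger.forward() from this patch.
import torch
import torch.nn as nn

from boudams.tagger import parse_architecture  # function added by this patch

# "[E256 Pl Do.3 CS5,256,10,Do.25 L256]": 256-dim embeddings, positional embeddings
# followed by a linear layer (Pl), 30% dropout, a sequential conv block
# (kernel 5, 256 filters, 10 layers, 25% dropout between convolutions),
# then a 256-dim linear projection.
emb_dim, modules, last_dim = parse_architecture(
    "[E256 Pl Do.3 CS5,256,10,Do.25 L256]",
    maximum_sentence_size=150,
)
assert emb_dim == 256 and last_dim == 256

vocab_size, n_classes = 100, 3                 # placeholder sizes for the example
embedder = nn.Embedding(vocab_size, emb_dim)   # stands in for BoudamsTagger._embedder
classifier = nn.Linear(last_dim, n_classes)    # stands in for BoudamsTagger._classifier

x = torch.randint(0, vocab_size, (2, 20))      # (batch size, sequence length)
x_len = torch.tensor([20, 20])                 # per-sample lengths

out = embedder(x)
for module in modules.values():                # ordered nn.ModuleDict from the parser
    out, x_len = module(out, x_len)            # every ModelWrapper returns (output, lengths)
logits = classifier(out)                       # (batch size, sequence length, n_classes)
```

The same loop covers `L<h>,<l>` and `G<h>,<l>` strings as well, since every `ModelWrapper` returns an `(output, lengths)` pair and the recurrent wrappers use `lengths` for packing and padding.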