diff --git a/CustomVGSL.md b/CustomVGSL.md
index c7e918a..c3fd99c 100644
--- a/CustomVGSL.md
+++ b/CustomVGSL.md
@@ -6,11 +6,13 @@ The new spec system is built around custom architecture strings.
 
 Available modules:
 
-- `C[A]<x>,<d>` uses a convolutional layer where `x` is the n-gram window and `d` the output.
-- `CP[A]<x>,<d>` uses a convolutional layer with positional embeddings where `x` is the n-gram window and `d` the output.
-- `L[A]<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
-- `G[A]<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
-- `D<r>` uses a Dropout layer with a rate of `r`
+- `C<x>,<d>[,<p>]` uses a convolutional layer where `x` is the n-gram window and `d` the output. `p` is an optional padding.
+- `CS[s]<x>,<d>,<l>[,Do<r>]` uses a sequential convolutional layer where `x` is the n-gram window, `d` the output and `l` the number
+of layers. An optional `Dropout` rate can be applied between each convolution. `[s]` (`CSs`) adds a final sum of the convolved output and the original input, with a scale.
+- `P[l]` adds positional embeddings, with an optional linear layer on top (e.g. `Pl`).
+- `L<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
+- `G<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
+- `Do<r>` uses a Dropout layer with a rate of `r`
 - `L<d>` uses a Linear layer of dimension `d`
 
 `[A]` can be replaced with an activation layer, such as:
@@ -21,8 +23,15 @@ Available modules:
 - `l` = linear (i.e., No non-linearity)
 - `m` = softmax
 - `n` = n/a
+- `g` = GLU
 
 The VGSL module must starts with an embedding size: `E`.
 
 Example: `[E200 L120 L200 Cr3,10 D3]` will use a Convolutional Layer of (3 ngram for 10 of dim) and a relu activation over which 30% of dropout is applied before classification
+
+## Legacy architectures
+
+- ConvPos `[E256 Pl Do.3 CS5,256,10,Do.25 L256]`
+- ConvNoPos `[E256 Do.3 CS5,256,10,Do.25 L256]`
+- Gru `[E256 Do.3 CSs5,256,10,Do.25 L256]`
\ No newline at end of file
diff --git a/boudams/cli.py b/boudams/cli.py index b0f1238..48d3d6f 100644 --- a/boudams/cli.py +++ b/boudams/cli.py @@ -180,11 +180,18 @@ def template(filename): @cli.command("train") -@click.argument("config_files", nargs=-1, type=click.File("r")) +@click.argument("train-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("dev-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("test-set", type=click.Path(file_okay=True, exists=True, dir_okay=False)) +@click.argument("output", type=click.Path(dir_okay=False, exists=False)) +@click.option("--architecture", type=str, help="VGSL-Like architecture.", + default="[E256 Pl Do.3 CSs5,256,10,Do.25 L256]", show_default=True) @click.option("--mode", type=click.Choice(_POSSIBLE_MODES), default="simple-space", show_default=True, help="Type of encoder you want to set-up") -@click.option("--output", type=click.Path(dir_okay=False, exists=False), default=None, help="Model Name") +@click.option("--normalize", type=bool, is_flag=True, default=False, help="Normalize string input with unidecode" " or mufidecode") +@click.option("--lower", type=bool, is_flag=True, default=False, help="Lower strings") @click.option("--epochs", type=int, default=100, help="Number of epochs to run") @click.option("--batch_size", type=int, default=32, help="Size of batches") @click.option("--device", default="cpu", help="Device to use for the network (cuda:0, cpu, etc.)") @@ -194,21 +201,35 @@ def template(filename): @click.option("--metric", default="f1", type=click.Choice(ACCEPTABLE_MONITOR_METRICS), help="Metric to monitor") @click.option("--avg", default="macro", type=click.Choice(["micro", "macro"]), help="Type of avering method to use on " "metrics") +@click.option("--lr", default=.0001, type=float, help="Learning rate", + show_default=True) @click.option("--delta", default=.001, type=float, help="Minimum change in the monitored quantity to qualify as an " - "improvement") -@click.option("--patience", default=3, type=int, help="Number of checks with no improvement after which training " - "will be stopped") + "improvement", + show_default=True) +@click.option("--patience", default=5,
type=int, help="Number of checks with no improvement after which training " + "will be stopped", + show_default=True) +@click.option("--lr-patience", default=3, type=int, help="Number of checks with no improvement for lowering LR", + show_default=True) +@click.option("--shuffle/--no-shuffle", type=bool, is_flag=True, default=True, + help="Suppress the shuffling of datasets", show_default=True) +@click.option("--lr-factor", default=.5, type=float, help="Ratio for lowering LR", show_default=True) @click.option("--seed", default=None, type=int, help="Runs deterministic training") @click.option("--optimizer", default="Adams", type=click.Choice(["Adams"]), help="Optimizer to use") # ToDo: Figure out the bug with Ranger # pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call # `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the # `optimizer.step(optimizer_closure)` call did not execute it internally. -def train(config_files: List[click.File], output: str, mode: str, - epochs: int, batch_size: int, device: str, debug: bool, workers: int, - auto_lr: bool, - metric: str, avg: str, delta: float, patience: int, - seed: int, optimizer: str): +def train( + train_set: str, dev_set: str, test_set: str, + architecture: str, output: str, mode: str, + normalize: bool, lower: bool, + epochs: int, batch_size: int, device: str, debug: bool, workers: int, + auto_lr: bool, + metric: str, avg: str, + lr: float, delta: float, patience: int, + lr_patience: int, lr_factor: float, + seed: int, optimizer: str, shuffle: bool): """ Train one or more models according to [CONFIG_FILES] JSON configurations""" if debug: logger.setLevel(logging.DEBUG) @@ -218,100 +239,99 @@ def train(config_files: List[click.File], output: str, mode: str, if seed: pl.seed_everything(seed, workers=True) + device = device.lower() if device == 'cpu': device = None elif device.startswith('cuda'): device = [int(device.split(':')[-1])] + else: + click.echo(click.style("Device is invalid. 
Either use `cpu` or `cuda:0`, `cuda:1`", fg="red")) + return - for config_file in config_files: - config = json.load(config_file) - - train_path, dev_path, test_path = config["datasets"]["train"],\ - config["datasets"]["dev"],\ - config["datasets"]["test"] - - vocabulary = LabelEncoder( - maximum_length=config.get("max_sentence_size", None), - mode=mode, - remove_diacriticals=config["label_encoder"].get("normalize", True), - lower=config["label_encoder"].get("lower", True) - ) - vocabulary.build(train_path, dev_path, test_path, debug=True) - if debug: - from pprint import pprint - pprint(vocabulary.mtoi) - - # Get the datasets - train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path) - dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path) - test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path) - - logger.info("Training %s " % config_file.name) - logger.info("-- Dataset informations --") - logger.info(f"Number of training examples: {len(train_dataset)}") - logger.info(f"Number of dev examples: {len(dev_dataset)}") - logger.info(f"Number of testing examples: {len(test_dataset)}") - logger.info(f"Vocabulary Size: {len(vocabulary)}") - logger.info("--------------------------") + train_path, dev_path, test_path = train_set, dev_set, test_set - tagger = BoudamsTagger( - vocabulary, - system=config["model"], - out_max_sentence_length=config.get("max_sentence_size", None), - metric_average=avg, - optimizer=OptimizerParams( - optimizer, - kwargs={"lr": config["learner"]["lr"]}, - scheduler={ - "patience": config["learner"].get("lr_patience", None), - "factor": config["learner"].get("lr_factor", None), - "threshold": delta - } - ), - **config["network"] - ) - trainer = Trainer( - gpus=device, - patience=patience, - min_delta=delta, - monitor=metric, - max_epochs=epochs, - gradient_clip_val=1, - model_name=output or (config["name"] + str(datetime.datetime.today()).replace(" ", "--").split(".")[0]), - # n_epochs=epochs, - auto_lr_find=auto_lr, - deterministic=True if seed else False + vocabulary = LabelEncoder( + mode=mode, + remove_diacriticals=normalize, + lower=lower + ) + maximum_sentence_size = vocabulary.build(train_path, dev_path, test_path, debug=True) + if debug: + from pprint import pprint + pprint(vocabulary.mtoi) + + # Get the datasets + train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path) + dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path) + test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path) + + logger.info("Architecture %s " % architecture) + logger.info("-- Dataset informations --") + logger.info(f"Number of training examples: {len(train_dataset)}") + logger.info(f"Number of dev examples: {len(dev_dataset)}") + logger.info(f"Number of testing examples: {len(test_dataset)}") + logger.info(f"Vocabulary Size: {len(vocabulary)}") + logger.info("--------------------------") + + tagger = BoudamsTagger( + vocabulary, + architecture=architecture, + maximum_sentence_size=maximum_sentence_size, + metric_average=avg, + optimizer=OptimizerParams( + optimizer, + kwargs={"lr": lr}, + scheduler={ + "patience": lr_patience, + "factor": lr_factor, + "threshold": delta + } ) - train_dataloader, dev_dataloader = ( - DataLoader( - train_dataset, - batch_size=batch_size, - shuffle=config["datasets"].get("random", True), - collate_fn=train_dataset.train_collate_fn, - num_workers=workers - ), - DataLoader( - dev_dataset, - batch_size=batch_size, - shuffle=config["datasets"].get("random", True), - collate_fn=dev_dataset.train_collate_fn, - 
num_workers=workers - ) + ) + trainer = Trainer( + gpus=device, + patience=patience, + min_delta=delta, + monitor=metric, + max_epochs=epochs, + gradient_clip_val=1, + model_name=output, + # n_epochs=epochs, + auto_lr_find=auto_lr, + deterministic=True if seed else False + ) + train_dataloader, dev_dataloader = ( + DataLoader( + train_dataset, + batch_size=batch_size, + shuffle=shuffle, + collate_fn=train_dataset.train_collate_fn, + num_workers=workers + ), + DataLoader( + dev_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dev_dataset.train_collate_fn, + num_workers=workers ) - if auto_lr: - trainer.tune(tagger, train_dataloader, dev_dataloader) - return - trainer.fit(tagger, train_dataloader, dev_dataloader) + ) - trainer.test( - tagger, - DataLoader( - test_dataset, - batch_size=batch_size, - collate_fn=test_dataset.train_collate_fn, - num_workers=workers - ) + if auto_lr: + trainer.tune(tagger, train_dataloader, dev_dataloader) + return + trainer.fit(tagger, train_dataloader, dev_dataloader) + + trainer.test( + tagger, + DataLoader( + test_dataset, + batch_size=batch_size, + collate_fn=test_dataset.train_collate_fn, + num_workers=workers, + shuffle=False ) + ) @cli.command("test") diff --git a/boudams/encoder.py b/boudams/encoder.py index 24be08b..f4512b4 100644 --- a/boudams/encoder.py +++ b/boudams/encoder.py @@ -76,11 +76,11 @@ def mode(self): def __len__(self): return len(self.stoi) - def build(self, *paths, debug=False): + def build(self, train, *paths, debug=False) -> int: """ Builds vocabulary :param paths: Path of file to read - :return: + :return: Maximum sentence size """ recorded_chars = set() counter = None @@ -88,10 +88,14 @@ def build(self, *paths, debug=False): counter = collections.Counter() logging.info("Reading files for vocabulary building") - for path in paths: + max_sentence_size = 0 + for path_idx, path in enumerate([train, *paths]): with open(path) as fio: for line in fio.readlines(): x, _ = self.readunit(line) + seq_len = len(x) + if seq_len > max_sentence_size: + max_sentence_size = seq_len recorded_chars.update(set(list(x))) logging.info("Saving {} chars to label encoder".format(len(recorded_chars))) @@ -102,6 +106,8 @@ def build(self, *paths, debug=False): # Reuse index for string retrieval self.itos[self.stoi[char]] = char + return max_sentence_size + def readunit(self, line) -> Tuple[str, str]: """ Read a single line diff --git a/boudams/model/__init__.py b/boudams/model/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/boudams/model/bidir.py b/boudams/model/bidir.py deleted file mode 100644 index 9b41501..0000000 --- a/boudams/model/bidir.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch.nn as nn - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.n_layers = n_layers - self.dropout = dropout - - self.embedding = nn.Embedding(input_dim, emb_dim) - - self.rnn = nn.GRU( - emb_dim, hid_dim, n_layers, - dropout=dropout, bidirectional=True, batch_first=True - ) - - self.dropout = nn.Dropout(dropout) - - @property - def output_dim(self): - return self.hid_dim * 2 - - def forward(self, src): - - # src = [src sent len, batch size] - embedded = self.dropout(self.embedding(src)) - - # embedded = [src sent len, batch size, emb dim] - - # packed_outputs = [src sent len, batch size, hid dim * n directions] - # hidden = [n layers * n directions, batch size, hid dim] - output, 
hidden = self.rnn(embedded) - - return output, hidden - - def init_weights(self): - for name, param in self.named_parameters(): - nn.init.uniform_(param.data, -0.08, 0.08) - diff --git a/boudams/model/conv.py b/boudams/model/conv.py deleted file mode 100644 index fdb0f06..0000000 --- a/boudams/model/conv.py +++ /dev/null @@ -1,94 +0,0 @@ -# https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, - max_sentence_len: int = 100): - super().__init__() - - assert kernel_size % 2 == 1, "Kernel size must be odd!" - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.kernel_size = kernel_size - self.dropout = dropout - self.max_sentence_len = max_sentence_len - - self.scale = torch.sqrt(torch.FloatTensor([0.5])) - - self.tok_embedding = nn.Embedding(input_dim, emb_dim) - self.pos_embedding = nn.Embedding(max_sentence_len, emb_dim) - - self.emb2hid = nn.Linear(emb_dim, hid_dim) - self.hid2emb = nn.Linear(hid_dim, emb_dim) - - self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, - out_channels=2 * hid_dim, - kernel_size=kernel_size, - padding=(kernel_size - 1) // 2) - for _ in range(n_layers)]) - - self.dropout = nn.Dropout(dropout) - - def forward(self, src): - # create position tensor - - # pos = [src sent len, batch size] (Not what is documented) - pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).type_as(src) - - # embed tokens and positions - tok_embedded = self.tok_embedding(src) - pos_embedded = self.pos_embedding(pos) - - # tok_embedded = pos_embedded = [batch size, src sent len, emb dim] - - # combine embeddings by elementwise summing - embedded = self.dropout(tok_embedded + pos_embedded) - - # embedded = [batch size, src sent len, emb dim] - - # pass embedded through linear layer to go through emb dim -> hid dim - conv_input = self.emb2hid(embedded) - - # conv_input = [batch size, src sent len, hid dim] - - - # permute for convolutional layer - conv_input = conv_input.permute(0, 2, 1) - - # conv_input = [batch size, hid dim, src sent len] - self.scale = self.scale.type_as(conv_input) - for i, conv in enumerate(self.convs): - # pass through convolutional layer - conved = conv(self.dropout(conv_input)) - - # conved = [batch size, 2*hid dim, src sent len] - - # pass through GLU activation function - conved = F.glu(conved, dim=1) - - # conved = [batch size, hid dim, src sent len] - - # apply residual connection - conved = (conved + conv_input) * self.scale - - # conved = [batch size, hid dim, src sent len] - - # set conv_input to conved for next lo`op iteration - conv_input = conved - - # permute and convert back to emb dim - conved = self.hid2emb(conved.permute(0, 2, 1)) - - # conved = [batch size, src sent len, emb dim] - - # elementwise sum output (conved) and input (embedded) to be used for attention - combined = (conved + embedded) * self.scale - - # combined = [batch size, src sent len, emb dim] - return conved, combined diff --git a/boudams/model/linear.py b/boudams/model/linear.py deleted file mode 100644 index 9e5e5ab..0000000 --- a/boudams/model/linear.py +++ /dev/null @@ -1,206 +0,0 @@ -# 
https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -from typing import Optional, List - - -from .conv import Encoder as CNNEncoder -from .lstm import Encoder as LSTMEncoder -from .bidir import Encoder as BiGruEncoder - - -class LinearEncoderCNN(CNNEncoder): - def forward(self, src, keep_pos=False): - o, p = super(LinearEncoderCNN, self).forward(src) - if keep_pos: - return p - return o - - -class LinearLSTMEncoder(LSTMEncoder): - """ Linear - version of the LSTMEncoder """ - - -class LinearEncoderCNNNoPos(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout): - super().__init__() - - assert kernel_size % 2 == 1, "Kernel size must be odd!" - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.kernel_size = kernel_size - self.dropout = dropout - - self.scale = torch.sqrt(torch.FloatTensor([0.5])) - - self.tok_embedding = nn.Embedding(input_dim, emb_dim) - - self.emb2hid = nn.Linear(emb_dim, hid_dim) - self.hid2emb = nn.Linear(hid_dim, emb_dim) - - self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim, - out_channels=2 * hid_dim, - kernel_size=kernel_size, - padding=(kernel_size - 1) // 2) - for _ in range(n_layers)]) - - self.dropout = nn.Dropout(dropout) - - def forward(self, src): - # create position tensor - - # pos = [src sent len, batch size] (Not what is documented) - pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1) - - # embed tokens and positions - tok_embedded = self.tok_embedding(src) - - # tok_embedded = pos_embedded = [batch size, src sent len, emb dim] - - # combine embeddings by elementwise summing - embedded = self.dropout(tok_embedded) - - # embedded = [batch size, src sent len, emb dim] - - # pass embedded through linear layer to go through emb dim -> hid dim - conv_input = self.emb2hid(embedded) - - # conv_input = [batch size, src sent len, hid dim] - - - # permute for convolutional layer - conv_input = conv_input.permute(0, 2, 1) - - # conv_input = [batch size, hid dim, src sent len] - - self.scale = self.scale.type_as(conv_input) - - for i, conv in enumerate(self.convs): - # pass through convolutional layer - conved = conv(self.dropout(conv_input)) - - # conved = [batch size, 2*hid dim, src sent len] - - # pass through GLU activation function - conved = F.glu(conved, dim=1) - - # conved = [batch size, hid dim, src sent len] - - # apply residual connection - conved = (conved + conv_input) * self.scale - - # conved = [batch size, hid dim, src sent len] - - # set conv_input to conved for next lo`op iteration - conv_input = conved - - # permute and convert back to emb dim - conved = self.hid2emb(conved.permute(0, 2, 1)) - - # conved = [batch size, src sent len, emb dim] - - # combined = [batch size, src sent len, emb dim] - return conved - - -class LinearDecoder(nn.Module): - """ - Simple Linear Decoder that outputs a probability distribution - over the vocabulary - Parameters - =========== - label_encoder : LabelEncoder - in_features : int, input dimension - """ - def __init__(self, enc_dim, out_dim, highway_layers=0, highway_act='relu'): - super().__init__() - self.out_dim = out_dim - # highway - self.highway = None - # decoder output - self.decoder = nn.Linear(enc_dim, out_dim) - - self.relu = True - - def forward(self, enc_outs): - if self.highway is not None: - enc_outs = 
self.highway(enc_outs) - - return self.decoder(enc_outs) - - -class MainModule(nn.Module): - masked_only = True - - def __init__( - self, - encoder: CNNEncoder, decoder: LinearDecoder, - pad_idx: int, - pos: bool = False, - **kwargs - ): - super().__init__() - - self.encoder = encoder - self.decoder: LinearDecoder = decoder - self.pos = pos - - self.pad_idx = pad_idx - - # nll weight - nll_weight = torch.ones(decoder.out_dim) - nll_weight[pad_idx] = 0. - self.register_buffer('nll_weight', nll_weight) - - def forward(self, src, src_len, trg=None, **kwargs): - # src = [batch size, src sent len] - # trg = [batch size, trg sent len] - - # calculate z^u (encoder_conved) and e (encoder_combined) - # encoder_conved is output from final encoder conv. block - # encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings - if isinstance(self.encoder, LinearEncoderCNN): - second_step = self.encoder(src, keep_pos=True) - elif isinstance(self.encoder, LinearEncoderCNNNoPos): - second_step = self.encoder(src) - elif isinstance(self.encoder, LinearLSTMEncoder): - second_step, hidden, cell = self.encoder(src.t()) - # -> tensor(sentence size, batch size, hid dim * n directions) - elif isinstance(self.encoder, BiGruEncoder): - second_step, hidden = self.encoder(src) - # -> tensor(sentence size, batch size, hid dim * n directions) - # second_step = second_step.transpose(1, 0) - # -> tensor(batch size, sentence size, hid dim * n directions) - else: - raise AttributeError("The encoder is not recognized.") - - output = self.decoder(second_step) - return output - - def predict( - self, - src, - src_len, - label_encoder: "LabelEncoder", - override_src: Optional[List[str]] = None - ) -> torch.Tensor: - """ Predicts value for a given tensor - - :param src: tensor(batch size x sentence_length) - :param src_len: tensor(batch size) - :param label_encoder: Encoder - :return: Reversed Batch - """ - out = self(src, src_len, None, teacher_forcing_ratio=0) - logits = torch.argmax(out, -1) - return label_encoder.reverse_batch( - input_batch=src, - mask_batch=logits - ) diff --git a/boudams/model/lstm.py b/boudams/model/lstm.py deleted file mode 100644 index 5dc45eb..0000000 --- a/boudams/model/lstm.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch.nn as nn - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.n_layers = n_layers - self.dropout = dropout - - self.embedding = nn.Embedding(input_dim, emb_dim) - - self.rnn = nn.LSTM( - emb_dim, hid_dim, n_layers, - dropout=dropout, bidirectional=True, batch_first=True - ) - - self.dropout = nn.Dropout(dropout) - - @property - def output_dim(self): - return 2 * self.hid_dim - - def forward(self, src): - - # src = [src sent len, batch size] - # ToDo: Check PackPadded given the results ? 
- embedded = self.dropout(self.embedding(src)) - - # embedded = [src sent len, batch size, emb dim] - - # packed_outputs = [src sent len, batch size, hid dim * n directions] - # hidden = [n layers * n directions, batch size, hid dim] - # cell = [n layers * n directions, batch size, hid dim] - output, (hidden, cell) = self.rnn(embedded) - - return output, hidden, cell - - def init_weights(self): - for name, param in self.named_parameters(): - nn.init.uniform_(param.data, -0.08, 0.08) - diff --git a/boudams/modes.py b/boudams/modes.py index 4444206..d5f3388 100644 --- a/boudams/modes.py +++ b/boudams/modes.py @@ -30,10 +30,9 @@ def __init__(self, masks: Dict[str, int] = None): DEFAULT_MASK_TOKEN: 1, DEFAULT_WB_TOKEN: 2 } - self.index_to_mask: Dict[str, int] = masks or { - 0: DEFAULT_PAD_TOKEN, - 1: DEFAULT_MASK_TOKEN, - 2: DEFAULT_WB_TOKEN + self.index_to_mask: Dict[str, int] = { + value: key + for value, key in self.masks_to_index.items() } self.index_to_masks_name: Dict[int, str] = { 0: "PAD", @@ -80,12 +79,12 @@ def generate_mask( :return: >>> (SimpleSpaceMode()).generate_mask("j'ai un cheval") - ('xxx|x|xxxxx|', "j'aiuncheval") + ("j'aiuncheval", '---|-|-----|') """ - split = self._space.split(string) + split = self._space.split(string.strip()) masks = DEFAULT_WB_TOKEN.join([DEFAULT_MASK_TOKEN * (len(tok)-1) for tok in split]) + DEFAULT_WB_TOKEN model_input = "".join(split) - assert len(masks) == len(model_input), f"Length of input and mask should be equal `{masks}` + `{model_input}`" + assert len(masks) == len(model_input), f"Length of input and mask should be equal `{masks}` + `{model_input}` + `{string}`" return model_input, masks def encode_mask(self, masked_string: Sequence[str]) -> List[int]: diff --git a/boudams/modules/__init__.py b/boudams/modules/__init__.py new file mode 100644 index 0000000..f52418a --- /dev/null +++ b/boudams/modules/__init__.py @@ -0,0 +1,311 @@ +from typing import Optional, Callable, Tuple +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.utils.rnn as rnn_utils + + +__all__ = [ + "ModelWrapper", + "PosEmbedding", + "Conv", "SequentialConv", + "Dropout", + "BiLSTM", "BiGru", + "Linear" +] + + +class ModelWrapper(nn.Module): + def __init__(self, input_dim: int, use_positional: bool = False): + super(ModelWrapper, self).__init__() + self._input_dim: int = input_dim + self._output_dim: int = 0 + self._nn: nn.Module = nn.Module() + self._use_positional: bool = use_positional + + @property + def input_dim(self): + return self._input_dim + + @property + def output_dim(self) -> int: + return self._output_dim + + def _forward( + self, + inp: torch.Tensor, + inp_length: Optional[torch.Tensor] = None + ) -> torch.Tensor: + raise NotImplementedError() + + def forward(self, + inp: torch.Tensor, + inp_length: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + return self._forward(inp, inp_length), inp_length + + def init_weights(self): + return + + +class Dropout(ModelWrapper): + def __init__( + self, + input_dim, + rate: float + ): + super(Dropout, self).__init__(input_dim=input_dim) + self._rate = rate + self._output_dim = input_dim + self._nn = nn.Dropout(self._rate) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + return self._nn(inp) + + +class Linear(ModelWrapper): + def __init__( + self, + input_dim, + output_dim: int + ): + super(Linear, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.Linear(in_features=input_dim, 
out_features=output_dim) + # Directions*Hidden_Dim + self._output_dim = output_dim + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + return self._nn(inp) + + +class Conv(ModelWrapper): + """ + LICENSE: https://github.com/allenai/allennlp/blob/master/LICENSE + """ + def __init__( + self, + input_dim, + out_filters: int, + filter_size: int, + padding_size: Optional[int] = 0, + activation: Optional[str] = "g" + ): + super(Conv, self).__init__(input_dim=input_dim, use_positional=False) + self._padding_size = padding_size + self._nn = nn.Conv1d( + in_channels=input_dim, + out_channels=out_filters if activation != "g" else 2*out_filters, + kernel_size=(filter_size, ), + padding=padding_size + ) + self._output_dim = out_filters + + if activation == "g": + self.activation: Callable[[torch.Tensor], torch.Tensor] = lambda conv_output: F.glu(conv_output, dim=1) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # Input is (batch_size, sequence_length, dimension) + # The convolution layers expect input of shape `(batch_size, in_channels, sequence_length) + # We permute last two dimensions + + # conv_input = [batch size, encoding dim, src sent len] + conv_input = inp.permute(0, 2, 1) + + # If we have glu, out_dim is 2*out_filters then out_filters at activation time + # Otherwise, always out_filters + # conved = [batch size, 2*out_filters, src sent len] + conved = self._nn(conv_input) + return self.activation(conved).permute(0, 2, 1) + + +class PosEmbedding(ModelWrapper): + """ + LICENSE: https://github.com/allenai/allennlp/blob/master/LICENSE + """ + def __init__( + self, + input_dim, + maximum_sentence_size: int, + padding_size: Optional[int] = 0, + activation: Optional[str] = None + ): + super(PosEmbedding, self).__init__(input_dim=input_dim, use_positional=False) + self._padding_size = padding_size + self._output_dim = self._input_dim + + self._nn: nn.Embedding = nn.Embedding(maximum_sentence_size, input_dim) + self.activation: Optional[nn.Linear] = None + if activation: + self.activation = nn.Linear(input_dim, input_dim) + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # Input is (batch_size, sequence_length, dimension) + # The convolution layers expect input of shape `(batch_size, in_channels, sequence_length) + # We permute last two dimensions + + pos = torch.arange(0, inp.shape[1], device=inp.device).unsqueeze(0).repeat(inp.shape[0], 1) + + inp = inp + self._nn(pos) + if self.activation is not None: + return self.activation(inp) + return inp + + +class BiLSTM(ModelWrapper): + def __init__( + self, + input_dim, + hidden_dim: int, + padding_idx = 0, + layers: int = 1 + ): + super(BiLSTM, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.LSTM( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=layers, + bidirectional=True, + batch_first=True + ) + # Directions*Hidden_Dim + self._output_dim = 2*hidden_dim + self._padding_idx: int = padding_idx + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + + # inp = [src sent len, batch size, emb dim] + # packed_outputs = [src sent len, batch size, hid dim * n directions] + inp = rnn_utils.pack_padded_sequence(inp, lengths=inp_length.cpu(), batch_first=True) + output, _ = self._nn(inp) + return rnn_utils.pad_packed_sequence( + output, + padding_value=self._padding_idx, + batch_first=True + )[0] + + 
def init_weights(self): + for name, param in self.named_parameters(): + nn.init.uniform_(param.data, -0.08, 0.08) + + +class BiGru(ModelWrapper): + def __init__( + self, + input_dim, + hidden_dim: int, + layers: int = 1, + padding_idx = 0 + ): + super(BiGru, self).__init__(input_dim=input_dim, use_positional=False) + self._nn = nn.GRU( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=layers, + bidirectional=True, + batch_first=True + ) + # Directions*Hidden_Dim + self._output_dim = 2 * hidden_dim + self._padding_idx: int = padding_idx + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + + # inp = [src sent len, batch size, emb dim] + # packed_outputs = [src sent len, batch size, hid dim * n directions] + + inp = rnn_utils.pack_padded_sequence(inp, lengths=inp_length.cpu(), batch_first=True) + output, _ = self._nn(inp) + return rnn_utils.pad_packed_sequence( + output, + padding_value=self._padding_idx, + batch_first=True + )[0] + + def init_weights(self): + for name, param in self.named_parameters(): + nn.init.uniform_(param.data, -0.08, 0.08) + + +class SequentialConv(ModelWrapper): + # https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb + def __init__( + self, + input_dim, + filter_dim, + n_layers, + filter_size, + use_sum: Optional[str] = None, + dropout: Optional[float] = None + ): + super().__init__(input_dim=input_dim) + + assert filter_size % 2 == 1, "Filter size must be odd!" + + self._output_dim = input_dim + self._filter_size = filter_size + + self._scale = torch.sqrt(torch.FloatTensor([0.5])) + self._inp_to_filter = nn.Linear(input_dim, filter_dim) + self._filter_to_inp = nn.Linear(filter_dim, input_dim) + + self._nns = nn.ModuleList([ + nn.Conv1d(in_channels=filter_dim, out_channels=2 * filter_dim, + kernel_size=filter_size, padding=(filter_size - 1) // 2) + for _ in range(n_layers) + ]) + + self._dropout: Optional[nn.Dropout] = None + if dropout: + self._dropout = nn.Dropout(dropout) + + self._use_sum: Optional[str] = use_sum + + def dropout(self, x): + if self._dropout is not None: + return self._dropout(x) + return x + + def _forward(self, inp: torch.Tensor, inp_length: Optional[torch.Tensor] = None) -> torch.Tensor: + # pass embedded through linear layer to go through emb dim -> hid dim + # conv_input = [batch size, src sent len, hid dim] + conv_input = self._inp_to_filter(inp) + + # permute for convolutional layer + conv_input = conv_input.permute(0, 2, 1) + + # conv_input = [batch size, hid dim, src sent len] + self._scale = self._scale.type_as(conv_input) + + conved: Optional[torch.Tensor] = None + + for i, conv in enumerate(self._nns): + # pass through convolutional layer + conved = conv(self.dropout(conv_input)) + + # conved = [batch size, 2*hid dim, src sent len] + + # pass through GLU activation function + conved = F.glu(conved, dim=1) + + # conved = [batch size, hid dim, src sent len] + + # apply residual connection + conved = (conved + conv_input) * self._scale + + # conved = [batch size, hid dim, src sent len] + + # set conv_input to conved for next lo`op iteration + conv_input = conved + + # permute and convert back to emb dim + conved = self._filter_to_inp(conved.permute(0, 2, 1)) + + if self._use_sum: + # conved = [batch size, src sent len, emb dim] + + # elementwise sum output (conved) and input (embedded) to be used for attention + combined = (conved + inp) * self._scale + + # 
combined = [batch size, src sent len, emb dim] + return combined + return conved diff --git a/boudams/tagger.py b/boudams/tagger.py index 663d621..84be004 100644 --- a/boudams/tagger.py +++ b/boudams/tagger.py @@ -7,13 +7,9 @@ import logging import regex as re from dataclasses import dataclass +from collections import OrderedDict from typing import List, Any, Optional, Dict, ClassVar, Tuple -from boudams.model import linear -from boudams import utils - -from .encoder import LabelEncoder - import pytorch_lightning as pl import torch.nn as nn import torch.optim as optim @@ -21,6 +17,9 @@ import torchmetrics from boudams.utils import improvement_on_min_or_max +from boudams.modules import * +from boudams import utils +from boudams.encoder import LabelEncoder teacher_forcing_ratio = 0.5 @@ -28,6 +27,31 @@ MAX_LENGTH = 150 +_re_embedding = re.compile(r"E(\d+)") +_re_conv = re.compile(r"C(\d+),(\d+)(?:,(\d+))?") +_re_sequential_conv = re.compile(r"CS(s)?(\d+),(\d+),(\d+)(?:,Do(0?\.\d+))") +_re_pos = re.compile(r"P(l)?") +_re_bilstm = re.compile(r"L(\d+),(\d+)") +_re_bigru = re.compile(r"G(\d+),(\d+)") +_re_linear = re.compile(r"L(\d+)") +_re_dropout = re.compile(r"Do(0?\.\d+)") + + +def _map_params(iterable): + def weird_float(x): + if x.startswith("."): + return x[1:].isnumeric() + return x.isnumeric() + + def eval_weird(x): + if x.startswith("."): + return float(f"0{x}") + else: + return eval(x) + + return (eval_weird(x) if x and weird_float(x) else x for x in iterable) + + class CrossEntropyLoss(pl.LightningModule): def __init__(self, pad_index, weights=None): super(CrossEntropyLoss, self).__init__() @@ -97,31 +121,121 @@ def get_optimizer( return optimizer, scheduler +class ArchitectureStringError(ValueError): + """ Error raised with wrong architecture string""" + + +def parse_architecture( + string: str, + maximum_sentence_size: Optional[int] = None +) -> Tuple[int, nn.ModuleDict, int]: + """ Returns an embedding dimension and a module list + + >>> BoudamsTagger.parse_architecture("[E200 C5,10]") + + Should result in `(200, nn.ModuleList([Conv(200, out_filters=10, filter_size=5)]))` + + >>> BoudamsTagger.parse_architecture("[E200 C5,10,2]") + Should result in `(200, nn.ModuleList([Conv(200, out_filters=10, filter_size=5, padding_size=2)]))` + """ + if string[0] != "[" and string[-1] != "]": + raise ArchitectureStringError("Architectures need top be encapsulated in [ ]") + string = string[1:-1] + modules: List[ModelWrapper] = [] + names: List[str] = [] + emb_dim = 0 + last_dim: int = 0 + for idx, module in enumerate(string.split()): + if idx == 0: + if _re_embedding.match(module): + emb_dim = eval(_re_embedding.match(module).group(1)) + last_dim = emb_dim + else: + raise ArchitectureStringError("First module needs to be an embedding module. 
Start with [E...]" + " where is a dimension, such as [E200") + elif _re_embedding.match(module): + raise ArchitectureStringError("You can't have embeddings after the first module") + elif _re_conv.match(module): + ngram, filter_dim, padding = _map_params(_re_conv.match(module).groups()) + modules.append(Conv( + input_dim=last_dim, + out_filters=filter_dim, + filter_size=ngram, + **(dict(padding_size=padding) if padding else {}) + )) + elif _re_sequential_conv.match(module): + use_sum, ngram, filter_dim, layers, drop = _map_params(_re_sequential_conv.match(module).groups()) + modules.append(SequentialConv( + input_dim=last_dim, + filter_dim=filter_dim, + filter_size=ngram, + n_layers=layers, + dropout=drop, + use_sum=use_sum or "" + )) + elif _re_pos.match(module): + activation, = _map_params(_re_pos.match(module).groups()) + modules.append(PosEmbedding( + input_dim=last_dim, + maximum_sentence_size=maximum_sentence_size, + activation=activation + )) + elif _re_bilstm.match(module): + hidden_dim, layers = _map_params(_re_bilstm.match(module).groups()) + modules.append(BiLSTM( + input_dim=last_dim, + hidden_dim=hidden_dim, + layers=layers + )) + elif _re_bigru.match(module): + hidden_dim, layers = _map_params(_re_bigru.match(module).groups()) + modules.append(BiGru( + input_dim=last_dim, + hidden_dim=hidden_dim, + layers=layers + )) + elif _re_linear.match(module): + dim, = _map_params(_re_linear.match(module).groups()) + modules.append(Linear( + input_dim=last_dim, + output_dim=dim + )) + elif _re_dropout.match(module): + rate, = _map_params(_re_dropout.match(module).groups()) + + modules.append(Dropout( + input_dim=last_dim, + rate=rate + )) + else: + raise ArchitectureStringError(f"Unknown `{module}` architecture") + + if len(modules): + last_dim = modules[-1].output_dim + names.append(module.replace(".", "_")) + + return emb_dim, nn.ModuleDict(OrderedDict(zip(names, modules))), last_dim + + class BoudamsTagger(pl.LightningModule): + def __init__( self, vocabulary: LabelEncoder, + architecture: str = "[E256 G256,2 D.3]", metric_average: str = "macro", - hidden_size: int = 256, - enc_n_layers: int = 10, - emb_enc_dim: int = 256, - enc_hid_dim: int = None, - enc_dropout: float = 0.5, - enc_kernel_size: int = 3, - out_max_sentence_length: int = 150, + maximum_sentence_size: int = 150, optimizer: Optional[OptimizerParams] = None, - system: str = "bi-gru", - have_metrics: bool = False, - **kwargs # RetroCompat + have_metrics: bool = False ): """ :param vocabulary: - :param hidden_size: - :param n_layers: - :param emb_enc_dim: - :param emb_dec_dim: - :param max_length: + :param architecture: + :param metric_average: + :param maximum_sentence_size: + :param optimizer: + :param have_metrics: """ super(BoudamsTagger, self).__init__() @@ -133,31 +247,31 @@ def __init__( self.optimizer_params.validate() self.lr = self.optimizer_params.kwargs.get("lr") - # Parse params and sizes - self.enc_hid_dim = self.dec_hid_dim = self.hidden_size = hidden_size - - if enc_hid_dim: - self.enc_hid_dim: int = enc_hid_dim - - self.emb_enc_dim: int = emb_enc_dim - self.enc_dropout: float = enc_dropout - self.enc_kernel_size: int = enc_kernel_size - self.enc_n_layers: int = enc_n_layers - - self.out_max_sentence_length: int = out_max_sentence_length - self.system: str = system + self._nb_classes = len(self.vocabulary.itom) + # Parse params and sizes + self._architecture: str = architecture + _emb_dims, sequence, last_dim = parse_architecture( + architecture, + maximum_sentence_size=maximum_sentence_size + ) + 
self._maximum_sentence_size: Optional[int] = maximum_sentence_size + self._emb_dims = _emb_dims + self._module_dict: nn.ModuleDict = sequence # Based on self.masked, decoder dimension can be drastically different - self.dec_dim = len(self.vocabulary.itom) - - # Build the module - self._build_nn() + self._embedder = nn.Embedding(self.vocabulary_dimension, self._emb_dims) + self._classifier = nn.Linear(last_dim, self._nb_classes) if self.optimizer_params: # ToDo: Allow for DiceLoss - self.train_loss = CrossEntropyLoss(weights=self.model.nll_weight, + # Needed when loading dict + nll_weight = torch.ones(self._nb_classes) + nll_weight[vocabulary.pad_token_index] = 0. + self.register_buffer('nll_weight', nll_weight) + + self.train_loss = CrossEntropyLoss(weights=self.nll_weight, pad_index=self.vocabulary.pad_token_index) - self.val_loss = CrossEntropyLoss(weights=self.model.nll_weight, + self.val_loss = CrossEntropyLoss(weights=self.nll_weight, pad_index=self.vocabulary.pad_token_index) if metric_average not in {"micro", "macro"}: @@ -184,55 +298,6 @@ def add_metrics(self, prefix, metric_average): multiclass=True )) - def _build_nn(self): - seq2seq_shared_params = { - "pad_idx": self.padtoken, - "out_max_sentence_length": self.out_max_sentence_length - } - - if self.system.endswith("-lstm"): - self.enc: linear.LSTMEncoder = linear.LinearLSTMEncoder( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout - ) - in_features = self.enc.output_dim - elif self.system.endswith("-gru"): - self.enc: linear.BiGruEncoder = linear.BiGruEncoder( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout - ) - in_features = self.enc.output_dim - elif self.system.endswith("-conv-no-pos"): - self.enc: linear.LinearEncoderCNNNoPos = linear.LinearEncoderCNNNoPos( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout, - kernel_size=self.enc_kernel_size - ) - in_features = self.emb_enc_dim - # This model does not need sentence length - self.out_max_sentence_length = None - else: - self.enc: linear.CNNEncoder = linear.LinearEncoderCNN( - self.vocabulary_dimension, emb_dim=self.emb_enc_dim, - n_layers=self.enc_n_layers, hid_dim=self.enc_hid_dim, - dropout=self.enc_dropout, - kernel_size=self.enc_kernel_size, - max_sentence_len=self.out_max_sentence_length - ) - in_features = self.emb_enc_dim - - self.dec: linear.LinearDecoder = linear.LinearDecoder( - enc_dim=in_features, out_dim=self.vocabulary.mode.classes_count - ) - self.model: linear.MainModule = linear.MainModule( - self.enc, self.dec, - pos="nopos" not in self.system, - **seq2seq_shared_params - ) - @property def padtoken(self): return self.vocabulary.pad_token_index @@ -240,41 +305,15 @@ def padtoken(self): @property def settings(self): return { - "enc_kernel_size": self.enc_kernel_size, - "enc_n_layers": self.enc_n_layers, - "hidden_size": self.hidden_size, - "enc_hid_dim": self.enc_hid_dim, - "emb_enc_dim": self.emb_enc_dim, - "enc_dropout": self.enc_dropout, - "out_max_sentence_length": self.out_max_sentence_length, - "system": self.system + "architecture": self._architecture, + "maximum_sentence_size": self._maximum_sentence_size } - @classmethod - def load(cls, fpath="./model.boudams_model"): - with tarfile.open(utils.ensure_ext(fpath, 'boudams_model'), 'r') as tar: - settings = 
json.loads(utils.get_gzip_from_tar(tar, 'settings.json.zip')) - - # load state_dict - #print(json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json"))) - vocab = LabelEncoder.load( - json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json")) - ) - - obj = cls(vocabulary=vocab, **settings) - - # load state_dict - with utils.tmpfile() as tmppath: - tar.extract('state_dict.pt', path=tmppath) - dictpath = os.path.join(tmppath, 'state_dict.pt') - obj.model.load_state_dict(torch.load(dictpath)) - - obj.model.eval() - - return obj - - def forward(self, x: torch.TensorType, x_len: Optional[torch.TensorType] = None) -> Any: - return self.model.forward(x, x_len) + def forward(self, x: torch.TensorType, x_len: Optional[torch.TensorType] = None) -> torch.Tensor: + after_seq_out = self._embedder(x) + for module in self._module_dict.values(): + after_seq_out, x_len = module(after_seq_out, x_len) + return self._classifier(after_seq_out) def training_step(self, batch, batch_idx): # -> pl.utilities.types.STEP_OUTPUT: """ Runs training step on a batch @@ -361,7 +400,7 @@ def _eval_step(self, batch, batch_idx, prefix: str): return x, y, gt, {"confusion_matrix": matrix} def _view_y_gt(self, y, gt): - return y.view(-1, self.model.decoder.out_dim), gt.view(-1) + return y.view(-1, self._nb_classes), gt.view(-1) def configure_optimizers(self): optimizer, scheduler = self.optimizer_params.get_optimizer( @@ -381,11 +420,15 @@ def configure_optimizers(self): ) def annotate(self, texts: List[str], batch_size=32, device: str = "cpu"): - self.model.eval() + self.eval() for n in range(0, len(texts), batch_size): batch = texts[n:n+batch_size] xs = [ - self.vocabulary.sent_to_numerical(self.vocabulary.prepare(s)) + self.vocabulary.sent_to_numerical( + self.vocabulary.mode.prepare_input( + self.vocabulary.prepare(s) + ) + ) for s in batch ] logging.info("Dealing with batch %s " % (int(n/batch_size)+1)) @@ -397,8 +440,8 @@ def annotate(self, texts: List[str], batch_size=32, device: str = "cpu"): if device != "cpu": tensor, sentence_length = tensor.to(device), sentence_length.to(device) - translations = self.model.predict( - tensor, sentence_length, label_encoder=self.vocabulary, + translations = self._string_predict( + tensor, sentence_length, override_src=[batch[order_id] for order_id in order] ) for index in range(len(translations)): @@ -412,9 +455,9 @@ def annotate_text(self, string, splitter=r"([⁊\W\d]+)", batch_size=32, device: strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)] strings = list(filter(lambda x: x.strip(), strings)) - if self.out_max_sentence_length: + if self._maximum_sentence_size: treated = [] - max_size = self.out_max_sentence_length - 5 + max_size = self._maximum_sentence_size for string in strings: if len(string) > max_size: treated.extend([ @@ -426,6 +469,28 @@ def annotate_text(self, string, splitter=r"([⁊\W\d]+)", batch_size=32, device: strings = treated yield from self.annotate(strings, batch_size=batch_size, device=device) + @classmethod + def load(cls, fpath="./model.boudams_model"): + with tarfile.open(utils.ensure_ext(fpath, 'boudams_model'), 'r') as tar: + settings = json.loads(utils.get_gzip_from_tar(tar, 'settings.json.zip')) + + vocab = LabelEncoder.load( + json.loads(utils.get_gzip_from_tar(tar, "vocabulary.json")) + ) + + obj = cls(vocabulary=vocab, **settings) + + # load state_dict + with utils.tmpfile() as tmppath: + tar.extract('state_dict.pt', path=tmppath) + dictpath = os.path.join(tmppath, 'state_dict.pt') + # Strict false for predict (nll_weight is 
removed) + obj.load_state_dict(torch.load(dictpath), strict=False) + + obj.eval() + + return obj + def dump(self, fpath="model"): fpath += ".boudams_model" fpath = utils.ensure_ext(fpath, 'boudams_model', infix=None) @@ -451,7 +516,26 @@ def dump(self, fpath="model"): # serialize field with utils.tmpfile() as tmppath: - torch.save(self.model.state_dict(), tmppath) + torch.save(self.state_dict(), tmppath) tar.add(tmppath, arcname='state_dict.pt') return fpath + + def _string_predict( + self, + src, + src_len, + override_src: Optional[List[str]] = None + ) -> torch.Tensor: + """ Predicts value for a given tensor + :param src: tensor(batch size x sentence_length) + :param src_len: tensor(batch size) + :param label_encoder: Encoder + :return: Reversed Batch + """ + out = self(src, src_len) + logits = torch.argmax(out, -1) + return self.vocabulary.reverse_batch( + input_batch=src, + mask_batch=logits + )
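
For illustration, a minimal sketch of how an architecture string from the new spec is parsed and applied, based on `parse_architecture()` and `BoudamsTagger.forward()` introduced in this patch. The `boudams.tagger` import path follows the patch; the vocabulary size, class count and tensor shapes are arbitrary placeholder values, and the snippet is an untested sketch rather than part of the change.

```python
# Minimal sketch (untested): parse a VGSL-like string and run the resulting module
# chain, mirroring parse_architecture() and BoudamsTagger.forward() from this patch.
import torch
import torch.nn as nn

from boudams.tagger import parse_architecture  # function added by this patch

# "[E256 Pl Do.3 CS5,256,10,Do.25 L256]": 256-dim embeddings, positional embeddings
# followed by a linear layer (Pl), 30% dropout, a sequential conv block
# (kernel 5, 256 filters, 10 layers, 25% dropout between convolutions),
# then a 256-dim linear projection.
emb_dim, modules, last_dim = parse_architecture(
    "[E256 Pl Do.3 CS5,256,10,Do.25 L256]",
    maximum_sentence_size=150,
)
assert emb_dim == 256 and last_dim == 256

vocab_size, n_classes = 100, 3                 # placeholder sizes for the example
embedder = nn.Embedding(vocab_size, emb_dim)   # stands in for BoudamsTagger._embedder
classifier = nn.Linear(last_dim, n_classes)    # stands in for BoudamsTagger._classifier

x = torch.randint(0, vocab_size, (2, 20))      # (batch size, sequence length)
x_len = torch.tensor([20, 20])                 # per-sample lengths

out = embedder(x)
for module in modules.values():                # ordered nn.ModuleDict from the parser
    out, x_len = module(out, x_len)            # every ModelWrapper returns (output, lengths)
logits = classifier(out)                       # (batch size, sequence length, n_classes)
```

The same loop covers `L<h>,<l>` and `G<h>,<l>` strings as well, since every `ModelWrapper` returns an `(output, lengths)` pair and the recurrent wrappers use `lengths` for packing and padding.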