Adds a Pseudo-/Custom-VGSL system to Boudams (#24)
* Remove legacy code
* Add modules that can be used in Custom-VGSL mode
  * Adapt former Sequential Conv
  * Document legacy structure
* Adapt the rest of the code
PonteIneptique authored Apr 12, 2022
1 parent d8ed1d5 commit 57691ca
Showing 11 changed files with 667 additions and 626 deletions.
19 changes: 14 additions & 5 deletions CustomVGSL.md
@@ -6,11 +6,13 @@ The new spec system is built around custom architecture strings.

Available modules:

- `C[A]<x>,<d>` uses a convolutional layer where `x` is the n-gram window and `d` the output.
- `CP[A]<x>,<d>` uses a convolutional layer with positional embeddings where `x` is the n-gram window and `d` the output.
- `L[A]<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
- `G[A]<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
- `D<r>` uses a Dropout layer with a rate of `r`
- `C<x>,<d>[,<p>]` uses a convolutional layer where `x` is the n-gram window and `d` the output. `p` is an optional padding.
- `CS[s]<x>,<d>,<l>[,Do<r>]` uses a sequential convolutional layer where `x` is the n-gram window, `d` the output and `l` the number of layers. An optional `Dropout` rate can be applied between each convolution. The `[s]` variant (`CSs`) applies a final addition of the convolved output and the original input, with a scale.
- `P[l]` adds positional embeddings, with an optional linear activation (e.g. `Pl`).
- `L<h>,<l>` uses a Bi-LSTM layer where `h` is the hidden size and `l` the number of layers.
- `G<h>,<l>` uses a Bi-GRU layer where `h` is the hidden size and `l` the number of layers.
- `Do<r>` uses a Dropout layer with a rate of `r`
- `L<d>` uses a Linear layer of dimension `d`

`[A]` can be replaced with an activation layer, such as:
@@ -21,8 +23,15 @@ Available modules:
- `l` = linear (i.e., no non-linearity)
- `m` = softmax
- `n` = n/a
- `g` = GLU

The VGSL module must start with an embedding size: `E<dim>`.

Example: `[E200 L120 L200 Cr3,10 D3]` will use a convolutional layer (3-gram window, output dimension 10) with a ReLU activation, over which 30% dropout is applied before classification.

## Legacy architectures

- ConvPos `[E256 Pl Do.3 CS5,256,10,Do.25 L256]`
- ConvNoPos `[E256 Do.3 CS5,256,10,Do.25 L256]`
- Gru `[E256 Do.3 CSs5,256,10Do.25 L256]`
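
To make the notation concrete, the following is a minimal sketch of how such a spec string could be tokenized into module descriptions. The regular expression and the returned structure are illustrative assumptions for this page, not the parser this commit adds to Boudams.

```python
# Illustrative only: tokenize an architecture string such as
# "[E256 Pl Do.3 CS5,256,10,Do.25 L256]" into module descriptions.
import re
from typing import Dict, List

# Longest codes first so that e.g. "CSs" is not read as a bare "C".
MODULE_RE = re.compile(
    r"(?P<name>CSs|CS|CP|Do|C|P|E|L|G)"   # module code
    r"(?P<flag>l?)"                       # optional trailing flag, e.g. the linear activation in `Pl`
    r"(?P<params>[\d.,]*?)"               # comma-separated numeric parameters
    r"(?:,?Do(?P<dropout>[\d.]+))?$"      # optional inline dropout, e.g. CS5,256,10,Do.25
)


def parse_architecture(spec: str) -> List[Dict]:
    """Split a bracketed spec string into a list of module descriptions."""
    modules = []
    for chunk in spec.strip("[]").split():
        match = MODULE_RE.match(chunk)
        if match is None:
            raise ValueError(f"Unknown module specification: {chunk!r}")
        modules.append({
            "module": match.group("name") + match.group("flag"),
            "params": [float(p) for p in match.group("params").split(",") if p],
            "dropout": float(match.group("dropout")) if match.group("dropout") else None,
        })
    return modules


if __name__ == "__main__":
    # The legacy "ConvPos" architecture from the list above:
    for module in parse_architecture("[E256 Pl Do.3 CS5,256,10,Do.25 L256]"):
        print(module)
```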
208 changes: 114 additions & 94 deletions boudams/cli.py
@@ -180,11 +180,18 @@ def template(filename):


@cli.command("train")
@click.argument("config_files", nargs=-1, type=click.File("r"))
@click.argument("train-set", type=click.Path(file_okay=True, exists=True, dir_okay=False))
@click.argument("dev-set", type=click.Path(file_okay=True, exists=True, dir_okay=False))
@click.argument("test-set", type=click.Path(file_okay=True, exists=True, dir_okay=False))
@click.argument("output", type=click.Path(dir_okay=False, exists=False))
@click.option("--architecture", type=str, help="VGSL-Like architecture.",
default="[E256 Pl Do.3 CSs5,256,10Do.25 L256]", show_default=True)
@click.option("--mode", type=click.Choice(_POSSIBLE_MODES),
default="simple-space", show_default=True,
help="Type of encoder you want to set-up")
@click.option("--output", type=click.Path(dir_okay=False, exists=False), default=None, help="Model Name")
@click.option("--normalize", type=bool, is_flag=True, default=False, help="Normalize string input with unidecode"
" or mufidecode")
@click.option("--lower", type=bool, is_flag=True, default=False, help="Lower strings")
@click.option("--epochs", type=int, default=100, help="Number of epochs to run")
@click.option("--batch_size", type=int, default=32, help="Size of batches")
@click.option("--device", default="cpu", help="Device to use for the network (cuda:0, cpu, etc.)")
@@ -194,21 +201,35 @@ def template(filename):
@click.option("--metric", default="f1", type=click.Choice(ACCEPTABLE_MONITOR_METRICS), help="Metric to monitor")
@click.option("--avg", default="macro", type=click.Choice(["micro", "macro"]), help="Type of avering method to use on "
"metrics")
@click.option("--lr", default=.0001, type=float, help="Learning rate",
show_default=True)
@click.option("--delta", default=.001, type=float, help="Minimum change in the monitored quantity to qualify as an "
"improvement")
@click.option("--patience", default=3, type=int, help="Number of checks with no improvement after which training "
"will be stopped")
"improvement",
show_default=True)
@click.option("--patience", default=5, type=int, help="Number of checks with no improvement after which training "
"will be stopped",
show_default=True)
@click.option("--lr-patience", default=3, type=int, help="Number of checks with no improvement for lowering LR",
show_default=True)
@click.option("--shuffle/--no-shuffle", type=bool, is_flag=True, default=True,
help="Suppress the shuffling of datasets", show_default=True)
@click.option("--lr-factor", default=.5, type=float, help="Ratio for lowering LR", show_default=True)
@click.option("--seed", default=None, type=int, help="Runs deterministic training")
@click.option("--optimizer", default="Adams", type=click.Choice(["Adams"]), help="Optimizer to use")
# ToDo: Figure out the bug with Ranger
# pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call
# `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the
# `optimizer.step(optimizer_closure)` call did not execute it internally.
def train(config_files: List[click.File], output: str, mode: str,
epochs: int, batch_size: int, device: str, debug: bool, workers: int,
auto_lr: bool,
metric: str, avg: str, delta: float, patience: int,
seed: int, optimizer: str):
def train(
train_set: str, dev_set: str, test_set: str,
architecture: str, output: str, mode: str,
normalize: bool, lower: bool,
epochs: int, batch_size: int, device: str, debug: bool, workers: int,
auto_lr: bool,
metric: str, avg: str,
lr: float, delta: float, patience: int,
lr_patience: int, lr_factor: float,
seed: int, optimizer: str, shuffle: bool):
""" Train one or more models according to [CONFIG_FILES] JSON configurations"""
if debug:
logger.setLevel(logging.DEBUG)
@@ -218,100 +239,99 @@ def train(config_files: List[click.File], output: str, mode: str,
if seed:
pl.seed_everything(seed, workers=True)

device = device.lower()
if device == 'cpu':
device = None
elif device.startswith('cuda'):
device = [int(device.split(':')[-1])]
else:
click.echo(click.style("Device is invalid. Either use `cpu` or `cuda:0`, `cuda:1`", fg="red"))
return

for config_file in config_files:
config = json.load(config_file)

train_path, dev_path, test_path = config["datasets"]["train"],\
config["datasets"]["dev"],\
config["datasets"]["test"]

vocabulary = LabelEncoder(
maximum_length=config.get("max_sentence_size", None),
mode=mode,
remove_diacriticals=config["label_encoder"].get("normalize", True),
lower=config["label_encoder"].get("lower", True)
)
vocabulary.build(train_path, dev_path, test_path, debug=True)
if debug:
from pprint import pprint
pprint(vocabulary.mtoi)

# Get the datasets
train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path)
dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path)
test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path)

logger.info("Training %s " % config_file.name)
logger.info("-- Dataset informations --")
logger.info(f"Number of training examples: {len(train_dataset)}")
logger.info(f"Number of dev examples: {len(dev_dataset)}")
logger.info(f"Number of testing examples: {len(test_dataset)}")
logger.info(f"Vocabulary Size: {len(vocabulary)}")
logger.info("--------------------------")
train_path, dev_path, test_path = train_set, dev_set, test_set

tagger = BoudamsTagger(
vocabulary,
system=config["model"],
out_max_sentence_length=config.get("max_sentence_size", None),
metric_average=avg,
optimizer=OptimizerParams(
optimizer,
kwargs={"lr": config["learner"]["lr"]},
scheduler={
"patience": config["learner"].get("lr_patience", None),
"factor": config["learner"].get("lr_factor", None),
"threshold": delta
}
),
**config["network"]
)
trainer = Trainer(
gpus=device,
patience=patience,
min_delta=delta,
monitor=metric,
max_epochs=epochs,
gradient_clip_val=1,
model_name=output or (config["name"] + str(datetime.datetime.today()).replace(" ", "--").split(".")[0]),
# n_epochs=epochs,
auto_lr_find=auto_lr,
deterministic=True if seed else False
vocabulary = LabelEncoder(
mode=mode,
remove_diacriticals=normalize,
lower=lower
)
maximum_sentence_size = vocabulary.build(train_path, dev_path, test_path, debug=True)
if debug:
from pprint import pprint
pprint(vocabulary.mtoi)

# Get the datasets
train_dataset: BoudamsDataset = vocabulary.get_dataset(train_path)
dev_dataset: BoudamsDataset = vocabulary.get_dataset(dev_path)
test_dataset: BoudamsDataset = vocabulary.get_dataset(test_path)

logger.info("Architecture %s " % architecture)
logger.info("-- Dataset informations --")
logger.info(f"Number of training examples: {len(train_dataset)}")
logger.info(f"Number of dev examples: {len(dev_dataset)}")
logger.info(f"Number of testing examples: {len(test_dataset)}")
logger.info(f"Vocabulary Size: {len(vocabulary)}")
logger.info("--------------------------")

tagger = BoudamsTagger(
vocabulary,
architecture=architecture,
maximum_sentence_size=maximum_sentence_size,
metric_average=avg,
optimizer=OptimizerParams(
optimizer,
kwargs={"lr": lr},
scheduler={
"patience": lr_patience,
"factor": lr_factor,
"threshold": delta
}
)
train_dataloader, dev_dataloader = (
DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=config["datasets"].get("random", True),
collate_fn=train_dataset.train_collate_fn,
num_workers=workers
),
DataLoader(
dev_dataset,
batch_size=batch_size,
shuffle=config["datasets"].get("random", True),
collate_fn=dev_dataset.train_collate_fn,
num_workers=workers
)
)
trainer = Trainer(
gpus=device,
patience=patience,
min_delta=delta,
monitor=metric,
max_epochs=epochs,
gradient_clip_val=1,
model_name=output,
# n_epochs=epochs,
auto_lr_find=auto_lr,
deterministic=True if seed else False
)
train_dataloader, dev_dataloader = (
DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=shuffle,
collate_fn=train_dataset.train_collate_fn,
num_workers=workers
),
DataLoader(
dev_dataset,
batch_size=batch_size,
shuffle=False,
collate_fn=dev_dataset.train_collate_fn,
num_workers=workers
)
if auto_lr:
trainer.tune(tagger, train_dataloader, dev_dataloader)
return
trainer.fit(tagger, train_dataloader, dev_dataloader)
)

trainer.test(
tagger,
DataLoader(
test_dataset,
batch_size=batch_size,
collate_fn=test_dataset.train_collate_fn,
num_workers=workers
)
if auto_lr:
trainer.tune(tagger, train_dataloader, dev_dataloader)
return
trainer.fit(tagger, train_dataloader, dev_dataloader)

trainer.test(
tagger,
DataLoader(
test_dataset,
batch_size=batch_size,
collate_fn=test_dataset.train_collate_fn,
num_workers=workers,
shuffle=False
)
)


@cli.command("test")
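
As a quick orientation to the reworked interface (datasets and architecture now come from the command line rather than JSON config files), the snippet below exercises the `train` command in-process with click's test runner. It is a sketch only: the dataset paths and option values are placeholders, and the exact positional-argument order should be checked against the command's own `--help` output.

```python
# Sketch only: poke at the reworked `train` command without leaving Python.
from click.testing import CliRunner

from boudams.cli import cli

runner = CliRunner()

# Always safe: list the new arguments and options (dataset paths,
# --architecture, --normalize, --lower, --lr, --patience, --lr-patience, ...).
print(runner.invoke(cli, ["train", "--help"]).output)

# Assumed invocation shape, based on the decorators shown in the diff above;
# the paths are placeholders.
result = runner.invoke(cli, [
    "train",
    "data/train.tsv", "data/dev.tsv", "data/test.tsv",
    "--architecture", "[E256 Pl Do.3 CS5,256,10,Do.25 L256]",
    "--mode", "simple-space",
    "--epochs", "10",
    "--batch_size", "8",
    "--device", "cpu",
])
print(result.exit_code)
print(result.output)
```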
12 changes: 9 additions & 3 deletions boudams/encoder.py
@@ -76,22 +76,26 @@ def mode(self):
def __len__(self):
return len(self.stoi)

def build(self, *paths, debug=False):
def build(self, train, *paths, debug=False) -> int:
""" Builds vocabulary
:param paths: Path of file to read
:return:
:return: Maximum sentence size
"""
recorded_chars = set()
counter = None
if debug:
counter = collections.Counter()

logging.info("Reading files for vocabulary building")
for path in paths:
max_sentence_size = 0
for path_idx, path in enumerate([train, *paths]):
with open(path) as fio:
for line in fio.readlines():
x, _ = self.readunit(line)
seq_len = len(x)
if seq_len > max_sentence_size:
max_sentence_size = seq_len
recorded_chars.update(set(list(x)))

logging.info("Saving {} chars to label encoder".format(len(recorded_chars)))
@@ -102,6 +106,8 @@ def build(self, *paths, debug=False):
# Reuse index for string retrieval
self.itos[self.stoi[char]] = char

return max_sentence_size

def readunit(self, line) -> Tuple[str, str]:
""" Read a single line
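
As a minimal usage sketch of the changed signature (the file paths are placeholders; the constructor keywords mirror the call made in `cli.py` above), `build()` now takes the training file explicitly and reports the longest sequence it saw:

```python
from boudams.encoder import LabelEncoder

# Placeholder paths; any files readable by LabelEncoder.readunit would do.
vocabulary = LabelEncoder(mode="simple-space", remove_diacriticals=False, lower=False)

# The first path is the new explicit `train` parameter; the return value is the
# maximum sentence size, which the CLI forwards to BoudamsTagger.
maximum_sentence_size = vocabulary.build("data/train.tsv", "data/dev.tsv", "data/test.tsv")

print(f"Vocabulary size: {len(vocabulary)}; longest sequence: {maximum_sentence_size}")
```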
Empty file removed boudams/model/__init__.py
Empty file.
43 changes: 0 additions & 43 deletions boudams/model/bidir.py

This file was deleted.
