Commit

Merge branch 'release/3.0.0'
hbredin committed Sep 22, 2023
2 parents 8fbf376 + 541caf4 commit 4322750
Showing 5 changed files with 94 additions and 32 deletions.
11 changes: 11 additions & 0 deletions doc/source/changelog.rst
@@ -2,6 +2,17 @@
Changelog
#########

Version 3.0.0 (2023-09-22)
~~~~~~~~~~~~~~~~~~~~~~~~~~

- BREAKING(cli): switch to latest pyannote.database API
- feat: add "seed" parameter for reproducible optimization
- feat(cli): add "device" section in configuration file
- feat(cli): add "--registry" option for custom database loading
- feat(cli): add "--average-case" option to optimize for average case
- setup: switch to optuna 3.1+
- feat: add support for optuna Journal storage

Version 2.3 (2022-06-16)
~~~~~~~~~~~~~~~~~~~~~~~~

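The changelog entries above map onto a small API surface. The sketch below is illustrative rather than part of this commit: it combines registry-based database loading (the library-side counterpart of the `--registry` option), a seeded sampler, Journal storage, and average-case optimization. The database file path, protocol name, and tuning loop are assumptions for the example, not code from this diff.

    from pathlib import Path

    from pyannote.database import FileFinder, registry
    from pyannote.pipeline.optimizer import Optimizer

    def tune(pipeline, n_iterations: int = 100):
        # load custom database configuration files (CLI: --registry=a.yml,b.yml)
        registry.load_database("my_databases.yml")  # placeholder path

        # protocols are now looked up through the registry, not get_protocol()
        protocol = registry.get_protocol(
            "Etape.SpeakerDiarization.TV",
            preprocessors={"audio": FileFinder()},
        )

        optimizer = Optimizer(
            pipeline,
            db=Path("train_dir/trials.journal"),  # ".journal" selects Journal storage
            sampler="TPESampler",
            seed=42,            # new in 3.0.0: reproducible optimization
            average_case=True,  # new in 3.0.0: optimize average, not worst case
        )
        optimizer.tune(list(protocol.development()), n_iterations=n_iterations)
        return optimizer.best_params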
52 changes: 35 additions & 17 deletions pyannote/pipeline/experiment.py
@@ -3,7 +3,7 @@

# The MIT License (MIT)

# Copyright (c) 2018-2020 CNRS
# Copyright (c) 2018- CNRS

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -38,10 +38,11 @@
Common options:
<database.task.protocol> Experimental protocol (e.g. "Etape.SpeakerDiarization.TV")
--database=<db.yml> Path to database configuration file.
--registry=<db.yml> Comma-separated list of paths to database configuration files.
[default: ~/.pyannote/db.yml]
--subset=<subset> Set subset. Defaults to 'development' in "train"
mode, and to 'test' in "apply" mode.
"train" mode:
<experiment_dir> Set experiment root directory. This script expects
a configuration file called "config.yml" to live
@@ -57,6 +58,7 @@
bootstrap the optimization process. In practice,
this will simply run a first trial with this set
of parameters.
--average-case Optimize for average case instead of worst case.
"apply" mode:
<train_dir> Path to the directory containing trained hyper-
@@ -88,6 +90,10 @@
speech_activity_detection:
onset: 0.5
offset: 0.5
# pyannote.audio pipelines will run on CPU by default.
# use the `device` key to send them to GPU.
device: cuda
...................................................................
"train" mode:
@@ -115,7 +121,7 @@
from datetime import datetime

from pyannote.database import FileFinder
from pyannote.database import get_protocol
from pyannote.database import registry
from pyannote.database import get_annotated

from pyannote.core.utils.helper import get_class_by_name
@@ -161,7 +167,6 @@ def from_train_dir(cls, train_dir: Path, training: bool = False) -> "Experiment"
return xp

def __init__(self, experiment_dir: Path, training: bool = False):

super().__init__()

self.experiment_dir = experiment_dir
@@ -174,7 +179,6 @@ def __init__(self, experiment_dir: Path, training: bool = False):
# initialize preprocessors
preprocessors = {}
for key, preprocessor in self.config_.get("preprocessors", {}).items():

# preprocessors:
# key:
# name: package.module.ClassName
@@ -208,11 +212,18 @@ def __init__(self, experiment_dir: Path, training: bool = False):
)
self.pipeline_ = Klass(**self.config_["pipeline"].get("params", {}))

# freeze parameters
if "freeze" in self.config_:
params = self.config_["freeze"]
self.pipeline_.freeze(params)

# send to device
if "device" in self.config_:
import torch

device = torch.device(self.config_["device"])
self.pipeline_.to(device)

def train(
self,
protocol_name: str,
@@ -221,6 +232,7 @@ def train(
n_iterations: int = 1,
sampler: Optional[str] = None,
pruner: Optional[str] = None,
average_case: bool = False,
):
"""Train pipeline
@@ -240,6 +252,8 @@
Choose sampler between RandomSampler and TPESampler
pruner : `str`, optional
Choose between MedianPruner or SuccessiveHalvingPruner.
average_case : `bool`, optional
Optimize for average case. Defaults to False (i.e. worst case).
"""
train_dir = Path(
self.TRAIN_DIR.format(
@@ -250,15 +264,18 @@
)
train_dir.mkdir(parents=True, exist_ok=True)

protocol = get_protocol(protocol_name, preprocessors=self.preprocessors_)
protocol = registry.get_protocol(
protocol_name, preprocessors=self.preprocessors_
)

study_name = "default"
optimizer = Optimizer(
self.pipeline_,
db=train_dir / "iterations.db",
db=train_dir / "trials.journal",
study_name=study_name,
sampler=sampler,
pruner=pruner,
average_case=average_case,
)

direction = 1 if self.pipeline_.get_direction() == "minimize" else -1
@@ -290,7 +307,6 @@ def train(
count = itertools.count() if n_iterations < 0 else range(n_iterations)

for i, status in zip(count, iterations):

loss = status["loss"]

if direction * loss < direction * best_loss:
@@ -326,7 +342,7 @@ def best(self, protocol_name: str, subset: str = "development"):

study_name = "default"
optimizer = Optimizer(
self.pipeline_, db=train_dir / "iterations.db", study_name=study_name
self.pipeline_, db=train_dir / "trials.journal", study_name=study_name
)

try:
@@ -356,7 +372,9 @@ def apply(
"""

# file generator
protocol = get_protocol(protocol_name, preprocessors=self.preprocessors_)
protocol = registry.get_protocol(
protocol_name, preprocessors=self.preprocessors_
)

# load pipeline metric (when available)
try:
@@ -369,12 +387,10 @@ def apply(
output_dir / f"{protocol_name}.{subset}.{self.pipeline_.write_format}"
)
with open(output_ext, mode="w") as fp:

files = list(getattr(protocol, subset)())

desc = f"Processing {protocol_name} ({subset})"
for current_file in tqdm(iterable=files, desc=desc, unit="file"):

# apply pipeline and dump output to file
output = self.pipeline_(current_file)
self.pipeline_.write(fp, output)
@@ -412,14 +428,15 @@ def apply(


def main():

arguments = docopt(__doc__, version="Tunable pipelines")

for database_yml in arguments["--registry"].split(","):
registry.load_database(database_yml)

protocol_name = arguments["<database.task.protocol>"]
subset = arguments["--subset"]

if arguments["train"]:

if subset is None:
subset = "development"

@@ -435,6 +452,8 @@
if pretrained:
pretrained = Path(pretrained).expanduser().resolve(strict=True)

average_case = arguments["--average-case"]

experiment_dir = Path(arguments["<experiment_dir>"])
experiment_dir = experiment_dir.expanduser().resolve(strict=True)

@@ -446,10 +465,10 @@
pretrained=pretrained,
sampler=sampler,
pruner=pruner,
average_case=average_case,
)

if arguments["best"]:

if subset is None:
subset = "development"

@@ -460,7 +479,6 @@
experiment.best(protocol_name, subset=subset)

if arguments["apply"]:

if subset is None:
subset = "test"

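The `device` handling added to `Experiment.__init__` above reduces to a small pattern: read an optional `device` key from the parsed `config.yml` and move the pipeline onto it. A minimal standalone sketch, assuming only that the pipeline object implements `.to(device)` as shown in the diff (the function name is a placeholder):

    import torch

    def place_on_device(pipeline, config: dict):
        # `config` stands for the parsed config.yml; pipelines stay on CPU
        # unless a `device` key (e.g. "cpu", "cuda", "cuda:1") is present
        if "device" in config:
            pipeline.to(torch.device(config["device"]))
        return pipeline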
58 changes: 44 additions & 14 deletions pyannote/pipeline/optimizer.py
@@ -40,8 +40,9 @@
from optuna.pruners import BasePruner
from optuna.samplers import BaseSampler, TPESampler
from optuna.trial import Trial, FixedTrial
from optuna.storages import RDBStorage, JournalStorage, JournalFileStorage
from tqdm import tqdm
from scipy.stats import bayes_mvs
from optuna.storages import RDBStorage, JournalStorage, JournalFileStorage

from .pipeline import Pipeline
from .typing import PipelineInput
@@ -57,7 +58,9 @@ class Optimizer:
pipeline : `Pipeline`
Pipeline.
db : `Path`, optional
Path to iteration database on disk.
Path to trial database on disk. Use ".sqlite" extension for SQLite
backend, and ".journal" for Journal backend (preferred for parallel
optimization).
study_name : `str`, optional
Name of study. In case it already exists, study will continue from
there. # TODO -- generate this automatically
@@ -68,6 +71,11 @@
Algorithm for early pruning of trials. Must be one of "MedianPruner" or
"SuccessiveHalvingPruner", or a pruner instance.
Defaults to no pruning.
seed : `int`, optional
Seed value for the random number generator of the sampler.
Defaults to no seed.
average_case : `bool`, optional
Optimize for average case. Defaults to False (i.e. worst case).
"""

def __init__(
@@ -77,27 +85,37 @@ def __init__(
study_name: Optional[str] = None,
sampler: Optional[Union[str, BaseSampler]] = None,
pruner: Optional[Union[str, BasePruner]] = None,
seed: Optional[int] = None,
average_case: bool = False,
):

self.pipeline = pipeline

self.db = db
if db is None:
self.storage_ = None
else:
self.storage_ = f"sqlite:///{self.db}"
extension = Path(self.db).suffix
if extension == ".db":
warnings.warn(
"Storage with '.db' extension has been deprecated. Use '.sqlite' instead."
)
self.storage_ = RDBStorage(f"sqlite:///{self.db}")
elif extension == ".sqlite":
self.storage_ = RDBStorage(f"sqlite:///{self.db}")
elif extension == ".journal":
self.storage_ = JournalStorage(JournalFileStorage(f"{self.db}"))
self.study_name = study_name

if isinstance(sampler, BaseSampler):
self.sampler = sampler
elif isinstance(sampler, str):
try:
self.sampler = getattr(optuna.samplers, sampler)()
self.sampler = getattr(optuna.samplers, sampler)(seed=seed)
except AttributeError as e:
msg = '`sampler` must be one of "RandomSampler" or "TPESampler"'
raise ValueError(msg)
elif sampler is None:
self.sampler = TPESampler()
self.sampler = TPESampler(seed=seed)

if isinstance(pruner, BasePruner):
self.pruner = pruner
@@ -123,6 +141,8 @@ def __init__(
direction=self.pipeline.get_direction(),
)

self.average_case = average_case

@property
def best_loss(self) -> float:
"""Return best loss so far"""
@@ -140,7 +160,9 @@
return self.pipeline.instantiate(self.best_params)

def get_objective(
self, inputs: Iterable[PipelineInput], show_progress: Union[bool, Dict] = False,
self,
inputs: Iterable[PipelineInput],
show_progress: Union[bool, Dict] = False,
) -> Callable[[Trial], float]:
"""
Create objective function used by optuna
@@ -199,7 +221,6 @@ def objective(trial: Trial) -> float:

# accumulate loss for each input
for i, input in enumerate(inputs):

# process input with pipeline
# (and keep track of processing time)
before_processing = time.time()
@@ -245,14 +266,24 @@ def objective(trial: Trial) -> float:
if len(np.unique(losses)) == 1:
mean = lower_bound = upper_bound = losses[0]
else:
(mean, (lower_bound, upper_bound)), _, _ = bayes_mvs(losses, alpha=0.9)
(mean, (lower_bound, upper_bound)), _, _ = bayes_mvs(
losses, alpha=0.9
)
else:
mean, (lower_bound, upper_bound) = metric.confidence_interval(alpha=0.9)

if self.pipeline.get_direction() == "minimize":
return upper_bound
else:
return lower_bound
if self.average_case:
if metric is None:
return mean

else:
return abs(metric)

return (
upper_bound
if self.pipeline.get_direction() == "minimize"
else lower_bound
)

return objective

@@ -336,7 +367,6 @@ def tune_iter(
self.study_.enqueue_trial(flattened_params)

while True:

# pipeline is currently being optimized
self.pipeline.training = True

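Two of the additions above are easy to miss. First, `Optimizer.__init__` now picks the optuna storage backend from the extension of `db`: SQLite for ".sqlite" (and the deprecated ".db"), Journal for ".journal". A condensed sketch of that dispatch, with an explicit error branch added as an assumption (the diff does not show what happens for other extensions):

    import warnings
    from pathlib import Path

    from optuna.storages import JournalFileStorage, JournalStorage, RDBStorage

    def make_storage(db: Path):
        extension = Path(db).suffix
        if extension == ".db":
            warnings.warn('".db" is deprecated; use ".sqlite" instead')
        if extension in (".db", ".sqlite"):
            return RDBStorage(f"sqlite:///{db}")
        if extension == ".journal":  # preferred for parallel optimization
            return JournalStorage(JournalFileStorage(str(db)))
        raise ValueError(f"unsupported storage extension: {extension}")  # assumption

Second, the objective now returns the mean loss (or `abs(metric)` when a pipeline metric is available) when `average_case` is set, and otherwise keeps the previous worst-case behavior of returning the 90% confidence-interval bound: the upper bound when minimizing, the lower bound when maximizing.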
3 changes: 3 additions & 0 deletions pyannote/pipeline/parameter.py
@@ -202,3 +202,6 @@ def __iter__(self):

def __getitem__(self, param_name):
return getattr(self, param_name)

def __getattr__(self, param_name):
return self.__getitem__(param_name)
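The new `__getattr__` delegates failed attribute lookups to `__getitem__`, so values stored in the container become reachable as attributes as well as items. A minimal standalone sketch of the same pattern, separate from the pyannote code (class and parameter names are placeholders):

    class ParamContainer:
        def __init__(self, **params):
            self._params = params

        def __getitem__(self, name):
            return self._params[name]

        def __getattr__(self, name):
            # called only when normal attribute lookup fails, so stored
            # parameters are also reachable as attributes
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name) from None

    params = ParamContainer(onset=0.5, offset=0.5)
    assert params.onset == params["onset"] == 0.5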
2 changes: 1 addition & 1 deletion setup.py
@@ -39,7 +39,7 @@
"pyannote.core >= 4.0",
"pyannote.database >= 4.0",
"PyYAML >= 3.12",
"optuna >= 1.4",
"optuna >= 3.1",
"tqdm >= 4.29.1",
"docopt >= 0.6.2",
"filelock >= 3.0.10",
