Commit

Merge branch 'release/3.0.0'
hbredin committed Sep 22, 2023
2 parents 8fbf376 + 541caf4 commit 4322750
Showing 5 changed files with 94 additions and 32 deletions.
11 changes: 11 additions & 0 deletions doc/source/changelog.rst
@@ -2,6 +2,17 @@
Changelog
#########

Version 3.0.0 (2023-09-22)
~~~~~~~~~~~~~~~~~~~~~~~~~~

- BREAKING(cli): switch to latest pyannote.database API
- feat: add "seed" parameter for reproducible optimization
- feat(cli): add "device" section in configuration file
- feat(cli): add "--registry" option for custom database loading
- feat(cli): add "--average-case" option to optimize for average case
- setup: switch to optuna 3.1+
- feat: add support for optuna Journal storage

Version 2.3 (2022-06-16)
~~~~~~~~~~~~~~~~~~~~~~~~

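The changelog entries above map onto a small API surface. The sketch below is illustrative rather than part of this commit: it combines registry-based database loading (the library-side counterpart of the `--registry` option), a seeded sampler, Journal storage, and average-case optimization. The database file path, protocol name, and tuning loop are assumptions for the example, not code from this diff.

    from pathlib import Path

    from pyannote.database import FileFinder, registry
    from pyannote.pipeline.optimizer import Optimizer

    def tune(pipeline, n_iterations: int = 100):
        # load custom database configuration files (CLI: --registry=a.yml,b.yml)
        registry.load_database("my_databases.yml")  # placeholder path

        # protocols are now looked up through the registry, not get_protocol()
        protocol = registry.get_protocol(
            "Etape.SpeakerDiarization.TV",
            preprocessors={"audio": FileFinder()},
        )

        optimizer = Optimizer(
            pipeline,
            db=Path("train_dir/trials.journal"),  # ".journal" selects Journal storage
            sampler="TPESampler",
            seed=42,            # new in 3.0.0: reproducible optimization
            average_case=True,  # new in 3.0.0: optimize average, not worst case
        )
        optimizer.tune(list(protocol.development()), n_iterations=n_iterations)
        return optimizer.best_params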
52 changes: 35 additions & 17 deletions pyannote/pipeline/experiment.py
@@ -3,7 +3,7 @@

# The MIT License (MIT)

# Copyright (c) 2018-2020 CNRS
# Copyright (c) 2018- CNRS

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -38,10 +38,11 @@
Common options:
<database.task.protocol> Experimental protocol (e.g. "Etape.SpeakerDiarization.TV")
--database=<db.yml> Path to database configuration file.
--registry=<db.yml> Comma-separated list of paths to database configuration files.
[default: ~/.pyannote/db.yml]
--subset=<subset> Set subset. Defaults to 'development' in "train"
mode, and to 'test' in "apply" mode.
"train" mode:
<experiment_dir> Set experiment root directory. This script expects
a configuration file called "config.yml" to live
@@ -57,6 +58,7 @@
bootstrap the optimization process. In practice,
this will simply run a first trial with this set
of parameters.
--average-case Optimize for average case instead of worst case.
"apply" mode:
<train_dir> Path to the directory containing trained hyper-
@@ -88,6 +90,10 @@
speech_activity_detection:
onset: 0.5
offset: 0.5
# pyannote.audio pipelines will run on CPU by default.
# use the `device` key to send them to GPU.
device: cuda
...................................................................
"train" mode:
@@ -115,7 +121,7 @@
from datetime import datetime

from pyannote.database import FileFinder
from pyannote.database import get_protocol
from pyannote.database import registry
from pyannote.database import get_annotated

from pyannote.core.utils.helper import get_class_by_name
@@ -161,7 +167,6 @@ def from_train_dir(cls, train_dir: Path, training: bool = False) -> "Experiment"
return xp

def __init__(self, experiment_dir: Path, training: bool = False):

super().__init__()

self.experiment_dir = experiment_dir
@@ -174,7 +179,6 @@ def __init__(self, experiment_dir: Path, training: bool = False):
# initialize preprocessors
preprocessors = {}
for key, preprocessor in self.config_.get("preprocessors", {}).items():

# preprocessors:
# key:
# name: package.module.ClassName
@@ -208,11 +212,18 @@ def __init__(self, experiment_dir: Path, training: bool = False):
)
self.pipeline_ = Klass(**self.config_["pipeline"].get("params", {}))

# freeze parameters
if "freeze" in self.config_:
params = self.config_["freeze"]
self.pipeline_.freeze(params)

# send to device
if "device" in self.config_:
import torch

device = torch.device(self.config_["device"])
self.pipeline_.to(device)

def train(
self,
protocol_name: str,
@@ -221,6 +232,7 @@ def train(
n_iterations: int = 1,
sampler: Optional[str] = None,
pruner: Optional[str] = None,
average_case: bool = False,
):
"""Train pipeline
@@ -240,6 +252,8 @@
Choose sampler between RandomSampler and TPESampler
pruner : `str`, optional
Choose between MedianPruner or SuccessiveHalvingPruner.
average_case : `bool`, optional
Optimize for average case. Defaults to False (i.e. worst case).
"""
train_dir = Path(
self.TRAIN_DIR.format(
@@ -250,15 +264,18 @@
)
train_dir.mkdir(parents=True, exist_ok=True)

protocol = get_protocol(protocol_name, preprocessors=self.preprocessors_)
protocol = registry.get_protocol(
protocol_name, preprocessors=self.preprocessors_
)

study_name = "default"
optimizer = Optimizer(
self.pipeline_,
db=train_dir / "iterations.db",
db=train_dir / "trials.journal",
study_name=study_name,
sampler=sampler,
pruner=pruner,
average_case=average_case,
)

direction = 1 if self.pipeline_.get_direction() == "minimize" else -1
@@ -290,7 +307,6 @@ def train(
count = itertools.count() if n_iterations < 0 else range(n_iterations)

for i, status in zip(count, iterations):

loss = status["loss"]

if direction * loss < direction * best_loss:
@@ -326,7 +342,7 @@ def best(self, protocol_name: str, subset: str = "development"):

study_name = "default"
optimizer = Optimizer(
self.pipeline_, db=train_dir / "iterations.db", study_name=study_name
self.pipeline_, db=train_dir / "trials.journal", study_name=study_name
)

try:
@@ -356,7 +372,9 @@ def apply(
"""

# file generator
protocol = get_protocol(protocol_name, preprocessors=self.preprocessors_)
protocol = registry.get_protocol(
protocol_name, preprocessors=self.preprocessors_
)

# load pipeline metric (when available)
try:
@@ -369,12 +387,10 @@ def apply(
output_dir / f"{protocol_name}.{subset}.{self.pipeline_.write_format}"
)
with open(output_ext, mode="w") as fp:

files = list(getattr(protocol, subset)())

desc = f"Processing {protocol_name} ({subset})"
for current_file in tqdm(iterable=files, desc=desc, unit="file"):

# apply pipeline and dump output to file
output = self.pipeline_(current_file)
self.pipeline_.write(fp, output)
@@ -412,14 +428,15 @@ def apply(


def main():

arguments = docopt(__doc__, version="Tunable pipelines")

for database_yml in arguments["--registry"].split(","):
registry.load_database(database_yml)

protocol_name = arguments["<database.task.protocol>"]
subset = arguments["--subset"]

if arguments["train"]:

if subset is None:
subset = "development"

@@ -435,6 +452,8 @@
if pretrained:
pretrained = Path(pretrained).expanduser().resolve(strict=True)

average_case = arguments["--average-case"]

experiment_dir = Path(arguments["<experiment_dir>"])
experiment_dir = experiment_dir.expanduser().resolve(strict=True)

@@ -446,10 +465,10 @@
pretrained=pretrained,
sampler=sampler,
pruner=pruner,
average_case=average_case,
)

if arguments["best"]:

if subset is None:
subset = "development"

@@ -460,7 +479,6 @@
experiment.best(protocol_name, subset=subset)

if arguments["apply"]:

if subset is None:
subset = "test"

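The `device` handling added to `Experiment.__init__` above reduces to a small pattern: read an optional `device` key from the parsed `config.yml` and move the pipeline onto it. A minimal standalone sketch, assuming only that the pipeline object implements `.to(device)` as shown in the diff (the function name is a placeholder):

    import torch

    def place_on_device(pipeline, config: dict):
        # `config` stands for the parsed config.yml; pipelines stay on CPU
        # unless a `device` key (e.g. "cpu", "cuda", "cuda:1") is present
        if "device" in config:
            pipeline.to(torch.device(config["device"]))
        return pipeline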
58 changes: 44 additions & 14 deletions pyannote/pipeline/optimizer.py
@@ -40,8 +40,9 @@
from optuna.pruners import BasePruner
from optuna.samplers import BaseSampler, TPESampler
from optuna.trial import Trial, FixedTrial
from optuna.storages import RDBStorage, JournalStorage, JournalFileStorage
from tqdm import tqdm
from scipy.stats import bayes_mvs
from optuna.storages import RDBStorage, JournalStorage, JournalFileStorage

from .pipeline import Pipeline
from .typing import PipelineInput
@@ -57,7 +58,9 @@ class Optimizer:
pipeline : `Pipeline`
Pipeline.
db : `Path`, optional
Path to iteration database on disk.
Path to trial database on disk. Use ".sqlite" extension for SQLite
backend, and ".journal" for Journal backend (preferred for parallel
optimization).
study_name : `str`, optional
Name of study. In case it already exists, study will continue from
there. # TODO -- generate this automatically
@@ -68,6 +71,11 @@
Algorithm for early pruning of trials. Must be one of "MedianPruner" or
"SuccessiveHalvingPruner", or a pruner instance.
Defaults to no pruning.
seed : `int`, optional
Seed value for the random number generator of the sampler.
Defaults to no seed.
average_case : `bool`, optional
Optimize for average case. Defaults to False (i.e. worst case).
"""

def __init__(
@@ -77,27 +85,37 @@ def __init__(
study_name: Optional[str] = None,
sampler: Optional[Union[str, BaseSampler]] = None,
pruner: Optional[Union[str, BasePruner]] = None,
seed: Optional[int] = None,
average_case: bool = False,
):

self.pipeline = pipeline

self.db = db
if db is None:
self.storage_ = None
else:
self.storage_ = f"sqlite:///{self.db}"
extension = Path(self.db).suffix
if extension == ".db":
warnings.warn(
"Storage with '.db' extension has been deprecated. Use '.sqlite' instead."
)
self.storage_ = RDBStorage(f"sqlite:///{self.db}")
elif extension == ".sqlite":
self.storage_ = RDBStorage(f"sqlite:///{self.db}")
elif extension == ".journal":
self.storage_ = JournalStorage(JournalFileStorage(f"{self.db}"))
self.study_name = study_name

if isinstance(sampler, BaseSampler):
self.sampler = sampler
elif isinstance(sampler, str):
try:
self.sampler = getattr(optuna.samplers, sampler)()
self.sampler = getattr(optuna.samplers, sampler)(seed=seed)
except AttributeError as e:
msg = '`sampler` must be one of "RandomSampler" or "TPESampler"'
raise ValueError(msg)
elif sampler is None:
self.sampler = TPESampler()
self.sampler = TPESampler(seed=seed)

if isinstance(pruner, BasePruner):
self.pruner = pruner
@@ -123,6 +141,8 @@ def __init__(
direction=self.pipeline.get_direction(),
)

self.average_case = average_case

@property
def best_loss(self) -> float:
"""Return best loss so far"""
@@ -140,7 +160,9 @@
return self.pipeline.instantiate(self.best_params)

def get_objective(
self, inputs: Iterable[PipelineInput], show_progress: Union[bool, Dict] = False,
self,
inputs: Iterable[PipelineInput],
show_progress: Union[bool, Dict] = False,
) -> Callable[[Trial], float]:
"""
Create objective function used by optuna
@@ -199,7 +221,6 @@ def objective(trial: Trial) -> float:

# accumulate loss for each input
for i, input in enumerate(inputs):

# process input with pipeline
# (and keep track of processing time)
before_processing = time.time()
@@ -245,14 +266,24 @@ def objective(trial: Trial) -> float:
if len(np.unique(losses)) == 1:
mean = lower_bound = upper_bound = losses[0]
else:
(mean, (lower_bound, upper_bound)), _, _ = bayes_mvs(losses, alpha=0.9)
(mean, (lower_bound, upper_bound)), _, _ = bayes_mvs(
losses, alpha=0.9
)
else:
mean, (lower_bound, upper_bound) = metric.confidence_interval(alpha=0.9)

if self.pipeline.get_direction() == "minimize":
return upper_bound
else:
return lower_bound
if self.average_case:
if metric is None:
return mean

else:
return abs(metric)

return (
upper_bound
if self.pipeline.get_direction() == "minimize"
else lower_bound
)

return objective

@@ -336,7 +367,6 @@ def tune_iter(
self.study_.enqueue_trial(flattened_params)

while True:

# pipeline is currently being optimized
self.pipeline.training = True

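Two of the additions above are easy to miss. First, `Optimizer.__init__` now picks the optuna storage backend from the extension of `db`: SQLite for ".sqlite" (and the deprecated ".db"), Journal for ".journal". A condensed sketch of that dispatch, with an explicit error branch added as an assumption (the diff does not show what happens for other extensions):

    import warnings
    from pathlib import Path

    from optuna.storages import JournalFileStorage, JournalStorage, RDBStorage

    def make_storage(db: Path):
        extension = Path(db).suffix
        if extension == ".db":
            warnings.warn('".db" is deprecated; use ".sqlite" instead')
        if extension in (".db", ".sqlite"):
            return RDBStorage(f"sqlite:///{db}")
        if extension == ".journal":  # preferred for parallel optimization
            return JournalStorage(JournalFileStorage(str(db)))
        raise ValueError(f"unsupported storage extension: {extension}")  # assumption

Second, the objective now returns the mean loss (or `abs(metric)` when a pipeline metric is available) when `average_case` is set, and otherwise keeps the previous worst-case behavior of returning the 90% confidence-interval bound: the upper bound when minimizing, the lower bound when maximizing.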
3 changes: 3 additions & 0 deletions pyannote/pipeline/parameter.py
@@ -202,3 +202,6 @@ def __iter__(self):

def __getitem__(self, param_name):
return getattr(self, param_name)

def __getattr__(self, param_name):
return self.__getitem__(param_name)
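The new `__getattr__` delegates failed attribute lookups to `__getitem__`, so values stored in the container become reachable as attributes as well as items. A minimal standalone sketch of the same pattern, separate from the pyannote code (class and parameter names are placeholders):

    class ParamContainer:
        def __init__(self, **params):
            self._params = params

        def __getitem__(self, name):
            return self._params[name]

        def __getattr__(self, name):
            # called only when normal attribute lookup fails, so stored
            # parameters are also reachable as attributes
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name) from None

    params = ParamContainer(onset=0.5, offset=0.5)
    assert params.onset == params["onset"] == 0.5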
2 changes: 1 addition & 1 deletion setup.py
@@ -39,7 +39,7 @@
"pyannote.core >= 4.0",
"pyannote.database >= 4.0",
"PyYAML >= 3.12",
"optuna >= 1.4",
"optuna >= 3.1",
"tqdm >= 4.29.1",
"docopt >= 0.6.2",
"filelock >= 3.0.10",
