From e3ee60b07da71f1e1b844719d027551791f70601 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Thu, 17 Oct 2024 15:46:22 +0200
Subject: [PATCH] feat: add option to filter protocol files (#56)

---
 doc/source/changelog.rst        |  6 ++++
 pyannote/pipeline/experiment.py | 59 ++++++++++++++++++++++++++++-----
 2 files changed, 56 insertions(+), 9 deletions(-)
diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
index c645ccf..4b94d4f 100644
--- a/doc/source/changelog.rst
+++ b/doc/source/changelog.rst
@@ -2,6 +2,12 @@
 Changelog
 #########
 
+develop
+~~~~~~~~
+
+- feat: add "--use-filter" option to filter training/validation files 
+
+
 Version 3.0.1 (2023-09-22)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pyannote/pipeline/experiment.py b/pyannote/pipeline/experiment.py
index 8190c30..03ae43d 100644
--- a/pyannote/pipeline/experiment.py
+++ b/pyannote/pipeline/experiment.py
@@ -42,7 +42,7 @@
                              [default: ~/.pyannote/db.yml]
   --subset=<subset>          Set subset. Defaults to 'development' in "train"
                              mode, and to 'test' in "apply" mode.
-  
+
 "train" mode:
   <experiment_dir>           Set experiment root directory. This script expects
                              a configuration file called "config.yml" to live
@@ -64,6 +64,8 @@
   <train_dir>                Path to the directory containing trained hyper-
                              parameters (i.e. the output of "train" mode).
 
+  --use-filter               Apply pipeline only to files that pass the filter.
+
 Configuration file:
     The configuration of each experiment is described in a file called
     <experiment_dir>/config.yml that describes the pipeline.
@@ -83,6 +85,12 @@
        audio: ~/.pyannote/db.yml   # load template from YAML file
        video: ~/videos/{uri}.mp4   # define template directly
 
+    # filters can be used to filter out some files from the protocol
+    # (e.g. to only keep files with a specific number of speakers)
+    filters:
+        pyannote.audio.utils.protocol.FilterByNumberOfSpeakers:
+            num_speakers: 2
+
     # one can freeze some hyper-parameters if needed (e.g. when
     # only part of the pipeline needs to be updated)
     freeze:
@@ -90,7 +98,7 @@
           speech_activity_detection:
               onset: 0.5
               offset: 0.5
-    
+
     # pyannote.audio pipelines will run on CPU by default.
     # use `device` key to send it to GPU.
     device: cuda
@@ -205,6 +213,17 @@ def __init__(self, experiment_dir: Path, training: bool = False):
 
         self.preprocessors_ = preprocessors
 
+        # initialize filters
+        filters = []
+        for key, params in self.config_.get("filters", {}).items():
+            Klass = get_class_by_name(key)
+            filters.append(Klass(**params))
+
+        def all_filters(i) -> bool:
+            return all(f(i) for f in filters)
+
+        self.filters_ = all_filters
+
         # initialize pipeline
         pipeline_name = self.config_["pipeline"]["name"]
         Klass = get_class_by_name(
@@ -295,7 +314,8 @@ def train(
         else:
             warm_start = None
 
-        inputs = list(getattr(protocol, subset)())
+        inputs = list(filter(self.filters_, getattr(protocol, subset)()))
+
         iterations = optimizer.tune_iter(
             inputs, warm_start=warm_start, show_progress=True
         )
@@ -359,7 +379,11 @@ def best(self, protocol_name: str, subset: str = "development"):
         print(content)
 
     def apply(
-        self, protocol_name: str, output_dir: Path, subset: Optional[str] = "test"
+        self,
+        protocol_name: str,
+        output_dir: Path,
+        subset: Optional[str] = "test",
+        use_filter: bool = False,
     ):
         """Apply current best pipeline
 
@@ -383,11 +407,20 @@ def apply(
             metric = None
 
         output_dir.mkdir(parents=True, exist_ok=True)
-        output_ext = (
-            output_dir / f"{protocol_name}.{subset}.{self.pipeline_.write_format}"
-        )
+        if use_filter:
+            output_ext = (
+                output_dir
+                / f"{protocol_name}.{subset}_INCOMPLETE.{self.pipeline_.write_format}"
+            )
+        else:
+            output_ext = (
+                output_dir / f"{protocol_name}.{subset}.{self.pipeline_.write_format}"
+            )
+
         with open(output_ext, mode="w") as fp:
             files = list(getattr(protocol, subset)())
+            if use_filter:
+                files = filter(self.filters_, files)
 
             desc = f"Processing {protocol_name} ({subset})"
             for current_file in tqdm(iterable=files, desc=desc, unit="file"):
@@ -422,7 +455,11 @@ def apply(
             print(msg)
             return
 
-        output_eval = output_dir / f"{protocol_name}.{subset}.eval"
+        if use_filter:
+            output_eval = output_dir / f"{protocol_name}.{subset}_INCOMPLETE.eval"
+        else:
+            output_eval = output_dir / f"{protocol_name}.{subset}.eval"
+
         with open(output_eval, "w") as fp:
             fp.write(str(metric))
 
@@ -482,6 +519,8 @@ def main():
         if subset is None:
             subset = "test"
 
+        use_filter = arguments["--use-filter"]
+
         train_dir = Path(arguments["<train_dir>"])
         train_dir = train_dir.expanduser().resolve(strict=True)
         experiment = Experiment.from_train_dir(train_dir, training=False)
@@ -492,4 +531,6 @@ def main():
             )
         )
 
-        experiment.apply(protocol_name, output_dir, subset=subset)
+        experiment.apply(
+            protocol_name, output_dir, subset=subset, use_filter=use_filter
+        )