From 1043e90942eefd340916facd54393f8cb57e2da8 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 16 Nov 2023 14:24:23 +0100 Subject: [PATCH 01/33] Fix incomplete variable name refactoring - Completed a variable name refactoring (from vec_level to emb_level) across multiple files. --- .../speaker_embeddings/anonymization/gan_anon.py | 2 +- .../speaker_embeddings/anonymization/pool_anon.py | 4 ++-- .../speaker_embeddings/speaker_anonymization.py | 2 +- .../modules/speaker_embeddings/speaker_embeddings.py | 8 ++++---- .../modules/speaker_embeddings/speaker_extraction.py | 12 ++++++------ anonymization/pipelines/sttts_pipeline.py | 2 +- configs/anon_ims_sttts_pc.yaml | 4 ++-- evaluation/privacy/asv/asv.py | 8 ++++---- evaluation/utility/voice_distinctiveness/deid_gvd.py | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index cc2b7fd..6066574 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -50,7 +50,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), speakers=speakers, genders=genders) if self.save_intermediate: diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index bcfdb7a..6ef578e 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -58,7 +58,7 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_d def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_dir): print(pool_data_dir) if pool_vec_path.exists(): - pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) pool_embeddings.load_vectors(pool_vec_path) else: extraction_settings = {'vec_type': self.vec_type, 'emb_level': 'spk'} @@ -90,7 +90,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender if not self.cross_gender else REVERSED_GENDERS[gender]) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), speakers=speakers, genders=genders) diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py index 4374236..929a55d 100644 --- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py +++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py @@ -36,7 +36,7 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name): # simply load them print('No computation of anonymized embeddings 
necessary; load existing anonymized speaker embeddings ' 'instead...') - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level=self.emb_level, device=self.device) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=self.emb_level, device=self.device) anon_embeddings.load_vectors(dataset_results_dir) return anon_embeddings else: diff --git a/anonymization/modules/speaker_embeddings/speaker_embeddings.py b/anonymization/modules/speaker_embeddings/speaker_embeddings.py index 5697a73..f0f57c6 100644 --- a/anonymization/modules/speaker_embeddings/speaker_embeddings.py +++ b/anonymization/modules/speaker_embeddings/speaker_embeddings.py @@ -7,9 +7,9 @@ class SpeakerEmbeddings: - def __init__(self, vec_type='xvector', vec_level='spk', device=torch.device('cpu')): + def __init__(self, vec_type='xvector', emb_level='spk', device=torch.device('cpu')): self.vec_type = vec_type - self.vec_level = vec_level + self.emb_level = emb_level self.device = device self.identifiers2idx = {} @@ -120,7 +120,7 @@ def get_spk2gender(self): return {speaker: gender for speaker, gender in zip(self.original_speakers, self.genders)} def convert_to_spk_level(self, method='average'): - assert self.vec_level == 'utt', \ + assert self.emb_level == 'utt', \ 'Speaker embeddings must be on utterance level to be able to convert them to speaker level!' if method == 'average': @@ -128,7 +128,7 @@ def convert_to_spk_level(self, method='average'): for i, speaker in enumerate(self.original_speakers): spk2idx[speaker].append(i) - spk_level_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + spk_level_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) spk_vectors, speakers, genders = [], [], [] if not isinstance(self.vectors, torch.Tensor): self.vectors = torch.tensor(self.vectors) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 9fcbf87..171d43b 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -26,7 +26,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) self.vec_type = settings['vec_type'] - self.vec_level = settings['vec_level'] + self.emb_level = settings['emb_level'] if results_dir: self.results_dir = results_dir @@ -40,7 +40,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.model_hparams = { 'vec_type': self.vec_type, - 'model_path': settings.get('vec_model_path') or model_dir + 'model_path': settings.get('embed_model_path') } if self.n_processes > 1: @@ -48,15 +48,15 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode else: self.extractors = create_extractors(hparams=self.model_hparams, device=self.devices[0]) - def extract_speakers(self, dataset_path, dataset_name=None, vec_level=None): + def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): dataset_name = dataset_name if dataset_name is not None else dataset_path.name dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') utt2spk = read_kaldi_format(dataset_path / 'utt2spk') wav_scp = read_kaldi_format(dataset_path / 'wav.scp') spk2gender = read_kaldi_format(dataset_path / 'spk2gender') - 
vec_level = vec_level if vec_level is not None else self.vec_level
+        emb_level = emb_level if emb_level is not None else self.emb_level

-        speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='utt', device=self.devices[0])
+        speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='utt', device=self.devices[0])

         if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute:
             print('No speaker extraction necessary; load existing embeddings instead...')
@@ -86,7 +86,7 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):

         speaker_embeddings.set_vectors(vectors=vectors, identifiers=utts, speakers=speakers, genders=genders)

-        if vec_level == 'spk':
+        if emb_level == 'spk':
             speaker_embeddings = speaker_embeddings.convert_to_spk_level()
         if self.save_intermediate:
             speaker_embeddings.save_vectors(dataset_results_dir)
diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py
index 7a6186f..a0fa36d 100644
--- a/anonymization/pipelines/sttts_pipeline.py
+++ b/anonymization/pipelines/sttts_pipeline.py
@@ -92,7 +92,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True):
             # Step 3: Synthesize
             wav_scp = self.speech_synthesis.synthesize_speech(dataset_name=dataset_name, texts=texts,
                                                               speaker_embeddings=anon_embeddings,
-                                                              prosody=anon_prosody, emb_level=anon_embeddings.vec_level)
+                                                              prosody=anon_prosody, emb_level=anon_embeddings.emb_level)
             anon_wav_scps[dataset_name] = wav_scp
             print('Done')

diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml
index 3eaf4a9..4b0f4fa 100644
--- a/configs/anon_ims_sttts_pc.yaml
+++ b/configs/anon_ims_sttts_pc.yaml
@@ -50,8 +50,8 @@ modules:
     emb_level: spk # possible: spk, utt
     anon_method: gan # possible: pool, random
     anon_name: gan_style-embed
-    extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<vec_level>-level
-    anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<vec_level>-level
+    extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<emb_level>-level
+    anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<emb_level>-level

     # pool_anon_settings are only used if anon_method == pool
     pool_anon_settings:
diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py
index 345923f..aea99ef 100644
--- a/evaluation/privacy/asv/asv.py
+++ b/evaluation/privacy/asv/asv.py
@@ -33,7 +33,7 @@ def __init__(self, model_dir, device, score_save_dir, distance='plda', plda_sett
         self.extractor = SpeakerExtraction(results_dir=self.score_save_dir / 'emb_xvect',
                                            model_dir=model_dir, devices=[self.device],
-                                           settings={'vec_type': vec_type, 'vec_level': 'utt'})
+                                           settings={'vec_type': vec_type, 'emb_level': 'utt'})

     def compute_trial_scores(self, trials, enrol_indices, test_indices, out_file, sim_scores):
         scores = []
@@ -85,8 +85,8 @@ def change_id_format(data_dict):
     def eer_compute(self, enrol_dir, test_dir, trial_runs_file):
         # Compute all enrol(spk level) and Test(utt level) embeddings
         # enroll vectors are the speaker-level average vectors
-        enrol_all_dict = self.extractor.extract_speakers(dataset_path=Path(enrol_dir), vec_level='spk')
-        test_all_dict = self.extractor.extract_speakers(dataset_path=Path(test_dir), vec_level='utt')
+        enrol_all_dict = self.extractor.extract_speakers(dataset_path=Path(enrol_dir), emb_level='spk')
+        test_all_dict = self.extractor.extract_speakers(dataset_path=Path(test_dir), emb_level='utt')

         enrol_vectors = []
         enrol_ids = []
@@ -148,7 +148,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids):
out_dir=plda_data_dir) print(f'Using data under {plda_data_dir}') - train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, vec_level='utt') + train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, emb_level='utt') self.plda = PLDAModel(train_embeddings=train_dict, results_path=self.plda_model_dir) plda_score_object = self.plda.compute_distance(enrollment_vectors=enrol_vectors, enrollment_ids=enrol_ids, diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index 1f47848..558eeb7 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -19,7 +19,7 @@ def __init__(self, spk_ext_model_dir, device, score_save_dir, plda_settings=None self.num_per_spk = num_per_spk self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', model_dir=spk_ext_model_dir, - devices=[device], settings={'vec_type': vec_type, 'vec_level': 'utt'}) + devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt'}) self.asv = ASV(model_dir=spk_ext_model_dir, device=device, score_save_dir=score_save_dir, distance=distance, plda_settings=plda_settings, vec_type=vec_type) From 24ff004e1594c7ea938baf1cce8477812f4e3541 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 21 Nov 2023 18:57:18 +0100 Subject: [PATCH 02/33] Fix bug: loading of 'spk' level embeddings into 'utt' level SpeakerEmbeddings object --- anonymization/modules/speaker_embeddings/speaker_extraction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 171d43b..2fca0a5 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -61,6 +61,8 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute: print('No speaker extraction necessary; load existing embeddings instead...') speaker_embeddings.load_vectors(dataset_results_dir) + # assume the loaded vectors are computed according to the setting in config + speaker_embeddings.emb_level = emb_level else: print(f'Extract embeddings of {len(wav_scp)} utterances') speaker_embeddings.new = True From 8206441cb3fd824a02215cf65f14b2025bbee369 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:12:01 +0100 Subject: [PATCH 03/33] Allow relative paths for `data_dir` --- utils/path_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/path_management.py b/utils/path_management.py index b2cdabd..c62cb25 100644 --- a/utils/path_management.py +++ b/utils/path_management.py @@ -39,7 +39,7 @@ def scan_checkpoint(cp_dir, prefix): def get_datasets(config): datasets = {} - data_dir = config.get('data_dir', None) + data_dir = config.get('data_dir', None).expanduser() # if '~' is given in path then manually expand for dataset in config['datasets']: if data_dir: for subset in dataset['enrolls'] + dataset['trials']: From e7efc5a35a81a9cfd9e2ea2143188fedd82ad0af Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:50:30 +0100 Subject: [PATCH 04/33] Dependency injection for anonymizer loading - Now the anonymization pipeline is completely separated from the anonymizer object. 
- Specify a BaseAnonymizer subclass in a config, then import that using !include syntax. Example is shown for GAN anonymizer. - Add a Passthrough anonymizer --- .../anonymization/base_anon.py | 4 ++ .../anonymization/gan_anon.py | 3 +- .../anonymization/passthrough.py | 26 ++++++++ .../anonymization/random_anon.py | 16 ++++- .../speaker_anonymization.py | 38 ++++-------- .../speaker_embeddings/speaker_extraction.py | 2 +- anonymization/pipelines/sttts_pipeline.py | 24 +++----- configs/anon/ims_gan.yaml | 9 +++ configs/anon/passthrough.yaml | 1 + configs/anon/pool.yaml | 11 ++++ configs/anon/random.yaml | 6 ++ configs/anon_ims_sttts_pc.yaml | 59 +++---------------- configs/datasets_vpc2022_official.yaml | 20 +++++++ 13 files changed, 123 insertions(+), 96 deletions(-) create mode 100644 anonymization/modules/speaker_embeddings/anonymization/passthrough.py create mode 100644 configs/anon/ims_gan.yaml create mode 100644 configs/anon/passthrough.yaml create mode 100644 configs/anon/pool.yaml create mode 100644 configs/anon/random.yaml create mode 100644 configs/datasets_vpc2022_official.yaml diff --git a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py index e9a804a..62fb888 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py @@ -7,6 +7,7 @@ class BaseAnonymizer: def __init__(self, vec_type='xvector', device=None, **kwargs): # Base class for speaker embedding anonymization. self.vec_type = vec_type + self.suffix = '_anon' if isinstance(device, torch.device): self.device = device @@ -20,3 +21,6 @@ def __init__(self, vec_type='xvector', device=None, **kwargs): def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): # Template method for anonymizing a dataset. Not implemented. raise NotImplementedError('anonymize_data') + + def to(self, device): + self.device = device diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index 6066574..f1af431 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -1,3 +1,4 @@ +from pathlib import Path import torch import numpy as np from scipy.spatial.distance import cosine @@ -15,7 +16,7 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_fil super().__init__(vec_type=vec_type, device=device) self.model_name = model_name if model_name else f'gan_{vec_type}' - self.vectors_file = vectors_file + self.vectors_file = Path(vectors_file) self.unused_indices_file = self.vectors_file.with_name(f'unused_indices_{self.vectors_file.name}') self.sim_threshold = sim_threshold self.save_intermediate = save_intermediate diff --git a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py new file mode 100644 index 0000000..742071f --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py @@ -0,0 +1,26 @@ +from .base_anon import BaseAnonymizer +import torch + +class Passthrough(BaseAnonymizer): + + def __init__(self, vec_type='xvector', device=None, **kwargs): + # Base class for speaker embedding anonymization. 
+ self.vec_type = vec_type + self.suffix = '_res' + + if isinstance(device, torch.device): + self.device = device + elif isinstance(device, str): + self.device = torch.device(device) + elif isinstance(device, int): + self.device = torch.device(f'cuda:{device}') + else: + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + # no need to refer to emb_level, + # as extractor also yields spk-level or utt-level. + return speaker_embeddings + + def to(self, device): + self.device = device diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py index 6a0c059..764e60d 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py @@ -14,11 +14,21 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, in_scale=F super().__init__(vec_type=vec_type, device=device) self.model_name = model_name if model_name else f'random_{vec_type}' - + if in_scale: - self.scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=stats_per_dim_path) + self.stats_per_dim_path = stats_per_dim_path else: - self.scaling_ranges = None + self.stats_per_dim_path = None + self._scaling_ranges = None + + @property + def scaling_ranges(self): + # defer loading of stats until they are first needed + # required after anonymizer initialization is delegated to HyperPyYAML + if self.stats_per_dim_path is not None: + self._scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=self.stats_per_dim_path) + self.stats_per_dim_path = None + return self._scaling_ranges def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): if self.scaling_ranges: diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py index 929a55d..5c82113 100644 --- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py +++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py @@ -1,6 +1,6 @@ from pathlib import Path -from .anonymization import PoolAnonymizer, RandomAnonymizer, GANAnonymizer +from .anonymization.base_anon import BaseAnonymizer from .speaker_embeddings import SpeakerEmbeddings @@ -26,6 +26,10 @@ def __init__(self, vectors_dir, device, settings, results_dir=None, save_interme raise ValueError('Results dir must be specified in parameters or settings!') self.anonymizer = self._load_anonymizer(settings) + + @property + def suffix(self): + return self.anonymizer.suffix def anonymize_embeddings(self, speaker_embeddings, dataset_name): dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else '' @@ -48,27 +52,11 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name): anon_embeddings.save_vectors(dataset_results_dir) return anon_embeddings - def _load_anonymizer(self, settings): - anon_method = settings['anon_method'] - vec_type = settings.get('vec_type', 'xvector') - model_name = settings.get('anon_name', None) - - if anon_method == 'random': - anon_settings = settings.get('random_anon_settings', {}) - model = RandomAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, **anon_settings) - - elif anon_method == 'pool': - anon_settings = settings.get('pool_anon_settings', {}) - model = PoolAnonymizer(vec_type=vec_type, device=self.device, 
model_name=model_name, - embed_model_dir=settings.get('embed_model_path', Path()), - save_intermediate=self.save_intermediate, **anon_settings) - - elif anon_method == 'gan': - anon_settings = settings.get('gan_anon_settings', {}) - model = GANAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, - save_intermediate=self.save_intermediate, **anon_settings) - else: - raise ValueError(f'Unknown anonymization method {anon_method}') - - print(f'Model type of anonymizer: {model_name}') - return model + def _load_anonymizer(self, settings: dict): + anon_method = settings['anon_method'] #HyperPyYAML already does the loading + assert isinstance(anon_method, BaseAnonymizer), \ + 'The anonymizer must be an instance of BaseAnonymizer, or a ' \ + f'subclass of it, but received an instance of {type(anon_method)}' + + print(f'Model type of anonymizer: {type(anon_method).__name__}') + return anon_method diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 2fca0a5..d986903 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -127,7 +127,7 @@ def extraction_job(data): try: spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors] - except RuntimeError: + except RuntimeError as e: print(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}') continue diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index a0fa36d..a7f1a6d 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -33,14 +33,12 @@ def __init__(self, config, force_compute_all, devices): self.speaker_extraction = SpeakerExtraction(model_dir=model_dir, devices=devices, save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) - if 'anonymizer' in modules_config['speaker_embeddings']: - self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) - else: - self.speaker_anonymization = None + force_compute=force_compute_all, + ) + self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config['speaker_embeddings'], + force_compute=force_compute_all) # Prosody component if 'prosody' in modules_config: @@ -78,11 +76,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): prosody = None # Step 2: Anonymize speaker, change prosody - if self.speaker_anonymization: - anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings, - dataset_name=dataset_name) - else: - anon_embeddings = spk_embeddings + anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings,dataset_name=dataset_name) if self.prosody_anonymization: anon_prosody = self.prosody_anonymization.anonymize_prosody(prosody=prosody) @@ -99,13 +93,11 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): if prepare_results: if self.speaker_anonymization: anon_vectors_path = self.speaker_anonymization.results_dir, - anon_suffix = '_anon' else: anon_vectors_path = self.speaker_extraction.results_dir - anon_suffix = '_res' now = 
datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M')
             prepare_evaluation_data(dataset_dict=datasets, anon_wav_scps=anon_wav_scps,
-                                    anon_vectors_path=anon_vectors_path, anon_suffix=anon_suffix,
+                                    anon_vectors_path=anon_vectors_path, anon_suffix=self.speaker_anonymization.suffix,
                                     output_path=self.results_dir / 'formatted_data' / now)
             save_yaml(self.config, self.results_dir / 'formatted_data' / now / 'config.yaml')

diff --git a/configs/anon/ims_gan.yaml b/configs/anon/ims_gan.yaml
new file mode 100644
index 0000000..8e19edc
--- /dev/null
+++ b/configs/anon/ims_gan.yaml
@@ -0,0 +1,9 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.gan_anon.GANAnonymizer
+    models_dir: null # overridden by main .yaml
+    vec_type: null # overridden by main .yaml
+    save_intermediate: null # overridden by main .yaml
+    model_name: null # overridden by constructor
+    vectors_file: !ref <models_dir>/anonymization/style-embed_wgan.pt
+    gan_model_path: !ref <models_dir>/anonymization/gan_<vec_type>/<vec_type>_wgan.pt
+    num_sampled: 5000
+    sim_threshold: 0.7
\ No newline at end of file
diff --git a/configs/anon/passthrough.yaml b/configs/anon/passthrough.yaml
new file mode 100644
index 0000000..36eac4a
--- /dev/null
+++ b/configs/anon/passthrough.yaml
@@ -0,0 +1 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.passthrough.Passthrough
\ No newline at end of file
diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml
new file mode 100644
index 0000000..ccdfd8d
--- /dev/null
+++ b/configs/anon/pool.yaml
@@ -0,0 +1,11 @@
+# pool_anon_settings are only used if anon_method == pool
+pool_data_dir: !ref <data_dir>/libritts_train_other_500
+pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
+N: 200
+N_star: 100
+distance: plda # possible: plda, cosine
+plda_dir: !ref <models_dir>/distances/plda/libritts_train_other_500_xvector
+cross_gender: false
+proximity: farthest # possible: farthest, nearest, center
+scaling: maxmin # possible: none, maxmin, mean
+stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
\ No newline at end of file
diff --git a/configs/anon/random.yaml b/configs/anon/random.yaml
new file mode 100644
index 0000000..c12afe6
--- /dev/null
+++ b/configs/anon/random.yaml
@@ -0,0 +1,6 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.random_anon.RandomAnonymizer
+    models_dir: null # will be overridden by main .yaml
+    vec_type: null # will be overridden by main .yaml
+    model_name: null # will be overridden by constructor
+    in_scale: false
+    stats_per_dim_path: !ref <models_dir>/anonymization/random_in-scale_<vec_type>/stats_per_dim.json
\ No newline at end of file
diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml
index 4b0f4fa..e135243 100644
--- a/configs/anon_ims_sttts_pc.yaml
+++ b/configs/anon_ims_sttts_pc.yaml
@@ -2,33 +2,15 @@ root_dir : ..
 data_dir: !ref <root_dir>/data # TODO adjust path
 save_intermediate: true
 save_output: true
-force_compute_all: false
-
-datasets:
-  - name: libri_dev
-    data: libri
-    set: dev
-    enrolls: [enrolls]
-    trials: [trials_f, trials_m]
-  - name: libri_test
-    data: libri
-    set: test
-    enrolls: [enrolls]
-    trials: [trials_f, trials_m]
-  - name: vctk_dev
-    data: vctk
-    set: dev
-    enrolls: [enrolls]
-    trials: [trials_f_all, trials_m_all]
-  - name: vctk_test
-    data: vctk
-    set: test
-    enrolls: [enrolls]
-    trials: [trials_f_all, trials_m_all]
 results_dir: !ref <root_dir>/results # TODO adjust path
 models_dir: !ref <root_dir>/models # TODO adjust path
 vectors_dir: !ref <root_dir>/results/original_speaker_embeddings
+
+force_compute_all: false
+save_intermediate: true
+datasets: !include:datasets_vpc2022_official.yaml
+
 pipeline: sttts

 modules:
@@ -48,36 +30,13 @@ modules:
     vec_type: style-embed
     embed_model_path: !ref <models_dir>/tts/Embedding/embedding_function.pt
     emb_level: spk # possible: spk, utt
-    anon_method: gan # possible: pool, random
-    anon_name: gan_style-embed
+    anon_method: !include:anon/ims_gan.yaml # possible: pool, random
+        models_dir: !ref <models_dir>
+        save_intermediate: !ref <save_intermediate>
+        vec_type: !ref <vec_type>
     extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<emb_level>-level
     anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<emb_level>-level

-    # pool_anon_settings are only used if anon_method == pool
-    pool_anon_settings:
-      pool_data_dir: !ref <data_dir>/libritts_train_other_500
-      pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
-      N: 200
-      N_star: 100
-      distance: plda # possible: plda, cosine
-      plda_dir: !ref <models_dir>/distances/plda/libritts_train_other_500_xvector
-      cross_gender: false
-      proximity: farthest # possible: farthest, nearest, center
-      scaling: maxmin # possible: none, maxmin, mean
-      stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
-
-    # random_anon_settings are only used if anon_method == random
-    random_anon_settings:
-      in_scale: true
-      stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
-
-    # gan_anon_settings are only used if anon_method == gan
-    gan_anon_settings:
-      vectors_file: !ref <models_dir>/anonymization/style-embed_wgan_generated_vectors.pt
-      gan_model_path: !ref <models_dir>/anonymization/style-embed_wgan.pt
-      num_sampled: 5000
-      sim_threshold: 0.7
-
   prosody:
     extractor_type: ims
     aligner_model_path: !ref <models_dir>/tts/Aligner/aligner.pt
diff --git a/configs/datasets_vpc2022_official.yaml b/configs/datasets_vpc2022_official.yaml
new file mode 100644
index 0000000..bf7e494
--- /dev/null
+++ b/configs/datasets_vpc2022_official.yaml
@@ -0,0 +1,20 @@
+  - name: libri_dev
+    data: libri
+    set: dev
+    enrolls: [enrolls]
+    trials: [trials_f, trials_m]
+  - name: libri_test
+    data: libri
+    set: test
+    enrolls: [enrolls]
+    trials: [trials_f, trials_m]
+  - name: vctk_dev
+    data: vctk
+    set: dev
+    enrolls: [enrolls]
+    trials: [trials_f_all, trials_m_all]
+  - name: vctk_test
+    data: vctk
+    set: test
+    enrolls: [enrolls]
+    trials: [trials_f_all, trials_m_all]
\ No newline at end of file

From 714b754a930fd0c2c3ad53e827c354df69acfca7 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 23 Nov 2023 16:51:05 +0100
Subject: [PATCH 05/33] Add utility to manipulate already existing VPC datasets

- Can convert the .scp files to absolute or relative paths.
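For illustration, a typical invocation might look like the following (the checkout path is a placeholder; Typer derives the `--vpc-baseline-path` flag from the script's `vpc_baseline_path` argument):

```bash
# rewrite the wav.scp entries of all listed datasets as absolute paths
python utils/relative_scp_to_abs.py --vpc-baseline-path ~/Voice-Privacy-Challenge-2022 --to absolute

# revert them to paths relative to the challenge folder
python utils/relative_scp_to_abs.py --vpc-baseline-path ~/Voice-Privacy-Challenge-2022 --to relative
```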
--- utils/relative_scp_to_abs.py | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 utils/relative_scp_to_abs.py diff --git a/utils/relative_scp_to_abs.py b/utils/relative_scp_to_abs.py new file mode 100644 index 0000000..e585870 --- /dev/null +++ b/utils/relative_scp_to_abs.py @@ -0,0 +1,76 @@ +from pathlib import Path +import typer +from typing import Optional +from typing_extensions import Annotated + + +def main( + vpc_baseline_path: Annotated[Optional[Path], typer.Option()], + to: str = 'absolute' + ): + """ + This script adapts evaluation datasets generated by the VoicePrivacy + Challenge 2022 framework codebase. The necessary changes are: + * Adapting the .scp files such that data paths are not relative to + the VPC baseline/ folder anymore but absolute. + * The changes can be also reverted with the same script. To do so, + just call it with --to=relative + """ + vpc_baseline_path = vpc_baseline_path.expanduser() + assert vpc_baseline_path.exists(), \ + f'The supplied path to VPC framework ({vpc_baseline_path}) does not exist' + + # determine the transform + if to.casefold() == 'absolute': + transform = lambda string: f'{vpc_baseline_path / string}' + elif to.casefold() == 'relative': + transform = lambda string: Path(string).relative_to(vpc_baseline_path) + + # for each dataset load .scp + dataset_list = [ + 'vctk_test_trials_m', + 'vctk_test_trials_m_common', + 'vctk_test_trials_m_all', + 'vctk_test_trials_f', + 'vctk_test_trials_f_common', + 'vctk_test_trials_f_all', + 'vctk_test_trials_all', + 'vctk_test_enrolls', + 'vctk_dev_trials_m', + 'vctk_dev_trials_m_common', + 'vctk_dev_trials_m_all', + 'vctk_dev_trials_f', + 'vctk_dev_trials_f_common', + 'vctk_dev_trials_f_all', + 'vctk_dev_trials_all', + 'vctk_dev_enrolls', + 'libri_test_trials_m', + 'libri_test_trials_f', + 'libri_test_trials_all', + 'libri_test_enrolls', + 'libri_dev_trials_m', + 'libri_dev_trials_f', + 'libri_dev_trials_all', + 'libri_dev_enrolls', + ] + for dataset in dataset_list: + dataset_path = vpc_baseline_path / 'data' / dataset + lines = [] + with open(dataset_path / 'wav.scp') as scp: + for line in scp.readlines(): + # scp format is {utt} {path}\n + items = line.split(' ') + new_line = f'{items[0]} {transform(items[1].strip())}\n' + lines.append(new_line) + # validate + for line in lines: + line = line.split(' ')[1].strip() + if to == 'relative': + line = vpc_baseline_path / line + assert Path(line).exists(), f'Line {line} has issues, exiting.' + with open(dataset_path / 'wav.scp', mode='w') as scp: + for line in lines: + scp.writelines(line) + +if __name__ == '__main__': + typer.run(main) \ No newline at end of file From fe8cdbd4fee26b94d652da4e4730ef711ae4e9a9 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:52:14 +0100 Subject: [PATCH 06/33] Add makefile & environment.yaml for dependency tracking. - Makefile installs the environment local to the project folder. 
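The intended workflow, sketched here for illustration (assumes `conda` is installed; `mamba` is picked up automatically when available):

```bash
make install            # create the ./voicepat_env environment from environment.yaml
make pretrained_models  # download and unpack the IMS anonymization, ASR and TTS models into models/
make help               # list all documented targets
```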
--- Makefile | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ environment.yaml | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Makefile create mode 100644 environment.yaml diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..714666d --- /dev/null +++ b/Makefile @@ -0,0 +1,53 @@ +############################### +## CONFIGURATION +############################### +PHONY: install uninstall pretrained_models espnet +.ONESHELL: + +PROJECT_NAME = voicepat +ENV_NAME = $(PROJECT_NAME)_env + +ifeq (, $(shell mamba --version)) +CONDA = conda +else +CONDA = mamba +endif + +############################### +##@ INSTALLATION +############################### + +install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment + +espnet: ## installs ESPNet + echo Deactivated + +uninstall: + @rm -rf $(ENV_NAME) + @rm -rf ESPNet + @rm -rf models/ + +pretrained_models: ## downloads the pretrained models from IMS repositories + @echo Downloading models from IMS repositories + @rm -rf models + @mkdir -p models + @wget -q -O models/anonymization.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/anonymization.zip + @wget -q -O models/asr.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/asr.zip + @wget -q -O models/tts.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/tts.zip + @unzip -oq models/asr.zip -d models + @unzip -oq models/tts.zip -d models + @unzip -oq models/anonymization.zip -d models + @rm models/*.zip + + +$(ENV_NAME): environment.yaml + @($(CONDA) env create -f $< -p ./$@ && @echo Installation complete, please run `conda-develop .` once.) 
|| $(CONDA) env update -f $< -p ./$@
+	@conda config --set env_prompt '($$(basename {default_env})) '
+	@(cat .gitignore | grep -q $(ENV_NAME)) || echo $(ENV_NAME) >> .gitignore
+
+###############################
+##@ SELF-DOCUMENTING COMMAND
+###############################
+
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
diff --git a/environment.yaml b/environment.yaml
new file mode 100644
index 0000000..058b844
--- /dev/null
+++ b/environment.yaml
@@ -0,0 +1,47 @@
+name: voicepat_env
+channels:
+  - nvidia
+  - pytorch
+  - conda-forge
+  - default
+dependencies:
+  - pip
+  - pip:
+    - speechbrain
+    - noisereduce
+    - pyloudnorm
+    - phonemizer
+    - praat-parselmouth
+    - espnet
+    - espnet_model_zoo
+  - python
+  - urllib3
+  - matplotlib
+  - seaborn
+  - jupyter
+  - jupyterlab
+  - ipywidgets
+  - ipympl
+  - blas=1.0=mkl
+  - numpy
+  - cvxopt
+  - scipy
+  - scikit-learn
+  - pyyaml
+  - click
+  - typer
+  - tqdm
+  - openpyxl
+  - librosa
+  - resampy
+  - python-sounddevice
+  - pytorch::pytorch-cuda
+  - pytorch::ignite
+  - pytorch::torchaudio
+  - tensorboardx
+  - tensorboard
+  - optuna
+  - hydra-core
+  - typeguard==2.13.3
+  - conda-build
+  - torchvision # only to prevent warnings
\ No newline at end of file

From e83892f073bcfb52936c86ebb3432b67a02a6c15 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 23 Nov 2023 17:09:58 +0100
Subject: [PATCH 07/33] Update "README.md"s

---
 README.md               | 91 ++++++++++++++++++-----------------------
 anonymization/README.md |  5 +++
 2 files changed, 45 insertions(+), 51 deletions(-)

diff --git a/README.md b/README.md
index 7596fdd..0988589 100644
--- a/README.md
+++ b/README.md
@@ -1,88 +1,77 @@
 # [VoicePAT: Voice Privacy Anonymization Toolkit](http://arxiv.org/abs/2309.08049)

-**Note: This repository and its documentation are still under construction but can already be used for both
-anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicPAT framework.
-If you are interested in contributing, please leave comments on a GitHub issue.**
-
-VoicePAT is a toolkit for speaker anonymization research, with special focus on speaker anonymization.
-It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements:
-
-* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of
-  anonymized speech is independent of the evaluation of anonymization systems. Both processes do not need to be
-  executed in the same run or with the same settings. Of course, you need to perform the anonymization of evaluation
-  data with one system before you can evaluate it but this could have happened at an earlier time and with an
-  external codebase.
-* Anonymization and evaluation procedures are **structured as pipelines** consisting of separate **modules**. Each
-  module may have a selection of different models or algorithm to fulfill its role. The settings for each procedure
-  / pipeline are defined exclusively in configuration files. See the *Usage* section below for more information.
-* **Evaluation models** have been exchanged by models based on [SpeechBrain](https://github.com/speechbrain/speechbrain/) and [ESPnet](https://github.com/espnet/espnet/) which are **more powerful** than the - previous Kaldi-based models. Furthermore, we added new techniques to make evaluation significantly **more - efficient**. -* The framework is written in **Python**, making it easy to include and adapt other Python-based models, e.g., using - PyTorch. When using the framework, you do not need in-depth knowledge about anything outside the Python realm - (Disclaimer: While being written in Python, the ASR evaluation is currently included with an ESPnet-based model - which in turn is based on Kaldi. However, you do not need to modify that part of the code for using or - changing the ASR model and ESPnet is currently working on a Kaldi-free version.) +**Note: This repository and its documentation are still under construction but can already be used for both anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicePAT framework. If you are interested in contributing, please leave comments on a GitHub issue.** +VoicePAT is a toolkit for speaker anonymization research. It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements: + +* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of anonymized speech is independent of the evaluation of anonymization systems. Both processes do not need to be executed in the same run or with the same settings. Of course, you need to perform the anonymization of evaluation data with one system before you can evaluate it but this could have happened at an earlier time and with an external codebase. +* Anonymization and evaluation procedures are **structured as pipelines** consisting of separate **modules**. Each module may have a selection of different models or algorithm to fulfill its role. The settings for each procedure / pipeline are defined exclusively in configuration files. See the *Usage* section below for more information. +* **Evaluation models** have been exchanged by models based on [SpeechBrain](https://github.com/speechbrain/speechbrain/) and [ESPnet](https://github.com/espnet/espnet/) which are **more powerful** than the previous Kaldi-based models. Furthermore, we added new techniques to make evaluation significantly **more efficient**. +* The framework is written in **Python**, making it easy to include and adapt other Python-based models, e.g., using PyTorch. When using the framework, you do not need in-depth knowledge about anything outside the Python realm. (Disclaimer: While being written in Python, the ASR evaluation is currently included with an ESPnet-based model which in turn is based on Kaldi. However, you do not need to modify that part of the code for using or changing the ASR model and ESPnet is currently working on a Kaldi-free version.) ## Installation -Simply clone the repository and install the dependencies in [requirements.txt](requirements.txt). If you want to use -the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to -it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``. 
+ +Requires `conda` for environment management. Installation of `mamba` is also recommended for speeding up the environment-related tasks. Simply clone the repository and run the following commands, a conda environment will be generated in the project root folder and the pretrained models will be downloaded. + +```bash +sudo apt install libespeak-ng # alternatively use your own package manager +make install pretrained_models +``` + +The datasets have to be downloaded via the VoicePrivacy Challenge framework. Once the download is complete, the `.scp` files need to be converted to the absolute path, because they are relative to the challenge folder. Use [utils/relative_scp_to_abs.py](utils/relative_scp_to_abs.py) for this purpose. Then simply point `data_path` in the YAML configurations to the data folder of the VoicePrivacy Challenge framework. + +If you want to use the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``. ## Usage ![](figures/framework.png) -For using the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can -also add more modules and models to the code and create your own config by using the existing ones as template. - +For using the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can also add more modules and models to the code and create your own config by using the existing ones as template. The configuration files use HyperPyYAML syntax, for which a useful reference is available [here](https://colab.research.google.com/drive/1Pg9by4b6-8QD2iC0U7Ic3Vxq4GEwEdDz?usp=sharing). ### Anonymization + The framework currently contains only one pipeline and config for anonymization, [anon_ims_sttts_pc.yaml](configs/anon_ims_sttts_pc.yaml). If you are using this config, you need to modify at least the following entries: -``` -data_dir: path to original data in Kaldi-format for anonymization -results_dir: path to location for all (intermediate) results of the anonymization -models_dir: path to models location + +```YAML +data_dir: # path to original data in Kaldi-format for anonymization +results_dir: # path to location for all (intermediate) results of the anonymization +models_dir: # path to models location ``` Running an anonymization pipeline is done like this: + ``` python run_anonymization.py --config anon_ims_sttts_pc.yaml --gpu_ids 0,1 --force_compute ``` -This will perform all computations that support parallel computing on the gpus with ID 0 and 1, and on GPU 0 -otherwise. If no gpu_ids are specified, it will run only on GPU 0 or CPU, depending on whether cuda is available. -`--force_compute` causes all previous computations to be run again. In most cases, you can delete that flag from the -command to speed up the anonymization. -Pretrained models for this anonymization can be found at [https://github. -com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases. +This will perform all computations that support parallel computing on the gpus with ID 0 and 1, and on GPU 0 otherwise. If no gpu_ids are specified, it will run only on GPU 0 or CPU, depending on whether cuda is available. `--force_compute` causes all previous computations to be run again. 
In most cases, you can delete that flag from the command to speed up the anonymization. + +Pretrained models for this anonymization can be found at [https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases. ### Evaluation -All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, -you need to adapt at least + +All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, you need to adapt at least + ``` eval_data_dir: path to anonymized evaluation data in Kaldi-format asr/libri_dir: path to original LibriSpeech dataset ``` Running an evaluation pipeline is done like this: + ``` python run_evaluation.py --config eval_pre_ecapa_cos.yaml --gpu_ids 1,2,3 ``` -making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, it will default to CUDA:0 or -use all GPUs if -cuda is available, or run on CPU otherwise. -Pretrained evaluation models can be found in release v1. +making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, it will default to CUDA:0 or use all GPUs if cuda is available, or run on CPU otherwise. + +Pretrained evaluation models can be found in release v1. ## Acknowledgements + Several parts of this toolkit are based on or use code from external sources, i.e., -* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation -* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization) - for - anonymization -See the READMEs for [anonymization](anonymization/README.md) and [evaluation](evaluation/README.md) for more -information. +* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation +* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization) for anonymization +See the READMEs for [anonymization](anonymization/README.md) and [evaluation](evaluation/README.md) for more information. diff --git a/anonymization/README.md b/anonymization/README.md index 7006f3c..a8226ff 100644 --- a/anonymization/README.md +++ b/anonymization/README.md @@ -4,4 +4,9 @@ The anonymization branch can contain multiple pipelines, modules and models. So [Speech-to-Text-to-Speech (STTTS) pipeline](https://ieeexplore.ieee.org/document/10096607), based on this code: [https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization). + +# Experiment with different speaker embedding mappings + +This is now simplified: you can define your anonymizer (a function that yields a speaker embedding when a speaker embedding is supplied) using the `!new` syntax of HyperPyYAML in a config file (e.g., see [ims_gan.yaml](../configs/anon/ims_gan.yaml)). The only requirement is that your anonymizer must implement the `BaseAnonymizer` API (see [base_anon.py](modules/speaker_embeddings/anonymization/base_anon.py)). 
+
 *This documentation is still under construction and will be extended soon.*
\ No newline at end of file

From a20f0532307d4ab7176f2272755a614f51b34519 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Fri, 24 Nov 2023 16:39:34 +0100
Subject: [PATCH 08/33] Temporary fix to #1: unstable prosody inference

- Each libespeak backend instantiation creates a copy of the library.
- For some reason, the previous instances (one per utterance) are not garbage collected.
- After 1500 to 2000 ProsodyExtraction calls at the latest, the anonymization pipeline crashes.
- As a temporary solution, since ProsodyExtraction does not support `n_processes>1`, we can make the different instances share the backend, as long as its specs do not change. This also accelerates the prosody extraction. A better fix could also allow parallel execution, but I could not see how to do that today.
---
 .../IMSToucan/Preprocessing/TextFrontend.py   | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py b/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
index db1cb9e..d30563c 100644
--- a/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
+++ b/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
@@ -13,6 +13,7 @@


 class ArticulatoryCombinedTextFrontend:
+    _backend = None

     def __init__(self,
                  language,
@@ -156,11 +157,23 @@ def __init__(self,
             print("Language not supported yet")
             sys.exit()

-        self.phonemizer_backend = EspeakBackend(language=self.g2p_lang,
-                                                punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
-                                                preserve_punctuation=True,
-                                                language_switch='remove-flags',
-                                                with_stress=self.use_stress)
+        # temporary: share the backend if config matches. this prevents multitasking.
+        if ArticulatoryCombinedTextFrontend._backend is None:
+            ArticulatoryCombinedTextFrontend._backend = EspeakBackend(language=self.g2p_lang,
+                                                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
+                                                                      preserve_punctuation=True,
+                                                                      language_switch='remove-flags',
+                                                                      with_stress=self.use_stress,
+                                                                      )
+        elif ArticulatoryCombinedTextFrontend._backend.language != self.g2p_lang or \
+                ArticulatoryCombinedTextFrontend._backend._with_stress != self.use_stress:
+            ArticulatoryCombinedTextFrontend._backend = EspeakBackend(language=self.g2p_lang,
+                                                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
+                                                                      preserve_punctuation=True,
+                                                                      language_switch='remove-flags',
+                                                                      with_stress=self.use_stress,
+                                                                      )
+        self.phonemizer_backend = ArticulatoryCombinedTextFrontend._backend

         self.phone_to_vector = generate_feature_table()
         self.phone_to_id = get_phone_to_id()

From d8c2f48b35f24fb3eedcac60f05fa7a966d389bc Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 28 Nov 2023 11:59:38 +0100
Subject: [PATCH 09/33] Fix #2 - filenames given in `hyperparams.yaml` interpreted as path

---
 .../privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml   | 4 +---
 evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml  | 4 +---
 .../privacy/asv/asv_train/hparams/xvector/hyperparams.yaml | 4 +---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
index 41928df..1b92bb2 100755
--- a/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
@@ -48,6 +48,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file
diff --git a/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
index 41928df..1b92bb2 100755
--- a/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
@@ -48,6 +48,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file
diff --git a/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
index 8cf88fd..f889c2a 100644
--- a/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
@@ -51,6 +51,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file

From 109525a384e56226371a8a0e51281740bd028c91 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 30 Nov 2023 12:03:27 +0100
Subject: [PATCH 10/33] Standardize `SpeakerExtraction` config and instantiation across anonymization and evaluation pipelines

---
 .../embedding_methods/speechbrain_vectors.py  | 12 ++++++++----
 .../speaker_embeddings/speaker_extraction.py  |  3 ++-
 anonymization/pipelines/sttts_pipeline.py     | 15 ++++++++-------
 evaluation/privacy/asv/asv.py                 |  4 ++--
 .../utility/voice_distinctiveness/deid_gvd.py |  4 ++--
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git
a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py index 83fd2fd..7c3bb18 100644 --- a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py +++ b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py @@ -11,13 +11,17 @@ class SpeechBrainVectors: 'ecapa': 'spkrec-ecapa-voxceleb' } - def __init__(self, vec_type, device, model_path=None): + def __init__(self, vec_type, device, model_path: Path = None): self.device = device if model_path is not None and model_path.exists(): - model_path = str(Path(model_path).absolute()) - self.extractor = EncoderClassifier.from_hparams(source=model_path, savedir=model_path, - run_opts={'device': self.device}) + model_path = Path(model_path).absolute() + savedir = model_path.parent + self.extractor = EncoderClassifier.from_hparams( + source=str(model_path), + savedir=str(savedir), + run_opts={'device': self.device} + ) else: if model_path is None: model_path = Path('') diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index d986903..5ce6cfb 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -25,6 +25,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.save_intermediate = save_intermediate self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) + self.embed_model_path = settings['embed_model_path'] self.vec_type = settings['vec_type'] self.emb_level = settings['emb_level'] @@ -40,7 +41,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.model_hparams = { 'vec_type': self.vec_type, - 'model_path': settings.get('embed_model_path') + 'model_path': self.embed_model_path, } if self.n_processes > 1: diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index a7f1a6d..53c542a 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -30,25 +30,25 @@ def __init__(self, config, force_compute_all, devices): settings=modules_config['asr'], force_compute=force_compute_all) # Speaker component - self.speaker_extraction = SpeakerExtraction(model_dir=model_dir, devices=devices, + self.speaker_extraction = SpeakerExtraction(devices=devices, save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all, + force_compute=force_compute, ) self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) + force_compute=force_compute) # Prosody component if 'prosody' in modules_config: self.prosody_extraction = ProsodyExtraction(device=devices[0], save_intermediate=save_intermediate, settings=modules_config['prosody'], - force_compute=force_compute_all) + force_compute=force_compute) if 'anonymizer' in modules_config['prosody']: self.prosody_anonymization = ProsodyAnonymization(save_intermediate=save_intermediate, settings=modules_config['prosody'], - force_compute=force_compute_all) + force_compute=force_compute) else: self.prosody_anonymization = None else: @@ -57,7 
+57,8 @@ def __init__(self, config, force_compute_all, devices): # TTS component self.speech_synthesis = SpeechSynthesis(devices=[devices[0]], settings=modules_config['tts'], model_dir=model_dir, save_output=config.get('save_output', True), - force_compute=force_compute_all) + force_compute=force_compute, + ) def run_anonymization_pipeline(self, datasets, prepare_results=True): anon_wav_scps = {} @@ -92,7 +93,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): if prepare_results: if self.speaker_anonymization: - anon_vectors_path = self.speaker_anonymization.results_dir, + anon_vectors_path = self.speaker_anonymization.results_dir else: anon_vectors_path = self.speaker_extraction.results_dir now = datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M') diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py index aea99ef..6792d44 100644 --- a/evaluation/privacy/asv/asv.py +++ b/evaluation/privacy/asv/asv.py @@ -32,8 +32,8 @@ def __init__(self, model_dir, device, score_save_dir, distance='plda', plda_sett self.plda_anon = None self.extractor = SpeakerExtraction(results_dir=self.score_save_dir / 'emb_xvect', - model_dir=model_dir, devices=[self.device], - settings={'vec_type': vec_type, 'emb_level': 'utt'}) + devices=[self.device], + settings={'vec_type': vec_type, 'emb_level': 'utt', 'embed_model_path': model_dir}) def compute_trial_scores(self, trials, enrol_indices, test_indices, out_file, sim_scores): scores = [] diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index 558eeb7..fb2c9df 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -18,8 +18,8 @@ def __init__(self, spk_ext_model_dir, device, score_save_dir, plda_settings=None vec_type='xvector', num_per_spk='all'): self.num_per_spk = num_per_spk - self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', model_dir=spk_ext_model_dir, - devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt'}) + self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', + devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt', 'embed_model_path': spk_ext_model_dir}) self.asv = ASV(model_dir=spk_ext_model_dir, device=device, score_save_dir=score_save_dir, distance=distance, plda_settings=plda_settings, vec_type=vec_type) From 628e6637011b5e2a4a50e0d970d22187fbd253fd Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 30 Nov 2023 22:18:22 +0100 Subject: [PATCH 11/33] Add "incomplete config exception" to inform users about TODO entries - Also fixed a duplicate `save_intermediate` entry in `anon_ims_sttts_pc.yaml` --- configs/anon_ims_sttts_pc.yaml | 7 +++---- configs/eval_pre_ecapa_cos.yaml | 4 ++-- utils/config_primitives.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 utils/config_primitives.py diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index e135243..ff9effe 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -1,14 +1,13 @@ root_dir : .. -data_dir: !ref /data # TODO adjust path +data_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /data save_intermediate: true save_output: true -results_dir: !ref /results # TODO adjust path -models_dir: !ref /models # TODO adjust path +results_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. 
/results +models_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false -save_intermediate: true datasets: !include:datasets_vpc2022_official.yaml pipeline: sttts diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 4fa6518..5d11375 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !new:utils.config_primitives.Todo # TODO path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !new:utils.config_primitives.Todo # TODO path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/utils/config_primitives.py b/utils/config_primitives.py new file mode 100644 index 0000000..dc715ed --- /dev/null +++ b/utils/config_primitives.py @@ -0,0 +1,10 @@ +class IncompleteConfigException(Exception): + pass + + +class Todo: + def __init__(self, *args, **kwargs): + raise IncompleteConfigException( + 'You must complete the config files before execution. ' + 'Find lines with "TODO" and enter the appropriate information.' + ) From 0ff54677ebcd9c747e69364639b2db81e701042b Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 1 Dec 2023 11:20:05 +0100 Subject: [PATCH 12/33] Fix evaluation config not pointing to dataset .yaml --- configs/eval_pre_ecapa_cos.yaml | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 5d11375..ca444ab 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
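With the `utils.config_primitives.Todo` guard from patch 11/33 above, an unfilled TODO entry now fails at config-load time instead of deep inside a run: HyperPyYAML's `!new:` tag instantiates the class while parsing, and `Todo.__init__` raises immediately. A minimal sketch of that behavior, assuming `utils/config_primitives.py` from this patch is importable and using an inline YAML string purely for illustration:

    # Minimal sketch: loading a config that still contains a TODO entry fails fast.
    # Assumes utils/config_primitives.py from patch 11/33 is on the Python path.
    from hyperpyyaml import load_hyperpyyaml

    yaml_string = "data_dir: !new:utils.config_primitives.Todo  # TODO adjust path"

    try:
        config = load_hyperpyyaml(yaml_string)
    except Exception as exc:  # IncompleteConfigException, raised in Todo.__init__
        print(f"Refusing to run: {exc}")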
Remove entry to skip privacy: From 06c11eca90ee56c8f749ea65d3d6833b89527840 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:05 +0100 Subject: [PATCH 13/33] Fixes to the anonymizer classes - BaseAnonymizer and its descendants are now dumpable - Improved documentation - The configs now use !PLACEHOLDER tag from HyperPyYAML instead of the custom class --- .../anonymization/base_anon.py | 58 ++++- .../anonymization/gan_anon.py | 95 +++++-- .../anonymization/passthrough.py | 44 ++-- .../anonymization/pool_anon.py | 241 +++++++++++++----- .../anonymization/random_anon.py | 98 +++++-- configs/anon/ims_gan.yaml | 9 +- configs/anon/pool.yaml | 24 +- configs/anon_ims_sttts_pc.yaml | 6 +- configs/eval_gvd_both.yaml | 2 +- configs/eval_post_ecapa_cos_ft.yaml | 2 +- configs/eval_post_ecapa_cos_scratch.yaml | 4 +- configs/eval_post_xvector_plda_scratch.yaml | 4 +- configs/eval_pre_ecapa_cos.yaml | 4 +- configs/eval_pre_ecapa_plda.yaml | 4 +- configs/eval_pre_xvector_cos.yaml | 4 +- configs/eval_pre_xvector_plda.yaml | 4 +- utils/config_primitives.py | 10 - 17 files changed, 445 insertions(+), 168 deletions(-) delete mode 100644 utils/config_primitives.py diff --git a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py index 62fb888..753720c 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py @@ -1,26 +1,70 @@ from pathlib import Path import torch +import ruamel.yaml as yaml +from ruamel.yaml.representer import RoundTripRepresenter, SafeRepresenter +from typing import Union class BaseAnonymizer: + """ + Base class for speaker embedding anonymizers, defining the API, + that consists of the following methods: + - anonymize_embeddings + - to + """ + def __init__( + self, + vec_type: str, + device: Union[str, torch.device, int, None], + suffix: str, + **kwargs, + ): + assert suffix[0] == "_", "Suffix must be a string and start with an underscore." - def __init__(self, vec_type='xvector', device=None, **kwargs): # Base class for speaker embedding anonymization. self.vec_type = vec_type - self.suffix = '_anon' + self.suffix = suffix if isinstance(device, torch.device): self.device = device elif isinstance(device, str): self.device = torch.device(device) elif isinstance(device, int): - self.device = torch.device(f'cuda:{device}') + self.device = torch.device(f"cuda:{device}") else: - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self.device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + # ensure dumpability + self.kwargs = kwargs + self.kwargs["vec_type"] = self.vec_type + self.kwargs["device"] = str(self.device) + self.kwargs["suffix"] = self.suffix + + def __repr__(self): + if hasattr(self, "kwargs"): + return f"{self.__class__.__name__}({self.kwargs})" + else: + return f"{self.__class__.__name__}()" + + def to_yaml(self, representer: yaml.Representer): + # first get data into dict format + data = {f"!new:{type(self).__qualname__}": self.kwargs} + return_str = representer.represent_dict(data) + return return_str + + def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk") -> torch.Tensor: # Template method for anonymizing a dataset. Not implemented. 
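The `to_yaml` hook together with the `add_multi_representer` registration (at the end of this `base_anon.py` hunk, just below) is what makes `BaseAnonymizer` and all of its subclasses dumpable: ruamel.yaml's multi-representer matches by inheritance, so registering the base class once covers every anonymizer. A stripped-down sketch of the mechanism with illustrative class names:

    # Stripped-down sketch of the multi-representer pattern used in base_anon.py:
    # registering the base class once makes every subclass YAML-dumpable.
    import sys
    import ruamel.yaml
    from ruamel.yaml.representer import RoundTripRepresenter

    class Base:
        def __init__(self, **kwargs):
            self.kwargs = kwargs  # stored solely so the object can be re-serialized

        def to_yaml(self, representer):
            return representer.represent_dict({f"!new:{type(self).__qualname__}": self.kwargs})

    class Child(Base):
        pass

    RoundTripRepresenter.add_multi_representer(Base, lambda rep, data: data.to_yaml(rep))

    yaml = ruamel.yaml.YAML()  # round-trip mode by default
    yaml.dump(Child(vec_type="xvector", device="cpu"), sys.stdout)

The dumped mapping mirrors the `!new:` constructor syntax of the configs (as a plain string key in this sketch), which is why storing the constructor arguments in `self.kwargs` is enough to make the objects round-trippable.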
- raise NotImplementedError('anonymize_data') - + raise NotImplementedError("anonymize_data") + def to(self, device): self.device = device + + +# necessary to make BaseAnonymizer and subclasses dumpable +RoundTripRepresenter.add_multi_representer( + BaseAnonymizer, lambda representer, data: data.to_yaml(representer) +) diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index f1af431..0fb5757 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -2,7 +2,9 @@ import torch import numpy as np from scipy.spatial.distance import cosine +from os import PathLike from tqdm import tqdm +from typing import Union from .base_anon import BaseAnonymizer from ..speaker_embeddings import SpeakerEmbeddings @@ -10,14 +12,48 @@ class GANAnonymizer(BaseAnonymizer): + """ + Implementation of the anonymizer proposed in the paper "Anonymizing + speech with generative adversarial networks to preserve speaker + privacy" (https://arxiv.org/pdf/2210.07002.pdf). + """ + def __init__( + self, + vec_type: str = "xvector", + device: Union[str, torch.device, int] = "cuda:0", + model_name: Union[str, PathLike] = None, + vectors_file: Union[str, PathLike] = None, + sim_threshold: float = 0.7, + gan_model_path: Union[str, PathLike] = None, + num_sampled: int = 1000, + save_intermediate: bool = False, + suffix: str = '_anon', + **kwargs, + ): + """ + Args: + vec_type: The type of the speaker embedding to anonymize. Valid + values are 'xvector', 'style-embed', 'ecapa' + device: The computation device to use for the anonymization. + model_name: The filename of the model used for the anonymization. + Defaults to 'gan_{vec_type}'. + vectors_file: The path to the file containing the GAN vectors. + Defaults to 'gan_vectors_{vec_type}.pt'. + sim_threshold: The minimum cosine similarity between the original + speaker embedding and the anonymized embedding. + gan_model_path: The path to the GAN model. + num_sampled: The number of GAN vectors to sample. + save_intermediate: If True, the GAN vectors and the unused indices + will be saved to files. + suffix: The suffix to append to the output files. 
+ """ + super().__init__(vec_type=vec_type, device=device, suffix=suffix) - def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_file=None, sim_threshold=0.7, - gan_model_path=None, num_sampled=1000, save_intermediate=False, **kwargs): - super().__init__(vec_type=vec_type, device=device) - - self.model_name = model_name if model_name else f'gan_{vec_type}' + self.model_name = model_name if model_name else f"gan_{vec_type}" self.vectors_file = Path(vectors_file) - self.unused_indices_file = self.vectors_file.with_name(f'unused_indices_{self.vectors_file.name}') + self.unused_indices_file = self.vectors_file.with_name( + f"unused_indices_{self.vectors_file.name}" + ) self.sim_threshold = sim_threshold self.save_intermediate = save_intermediate self.n = num_sampled @@ -25,17 +61,32 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_fil if self.vectors_file.is_file(): self.gan_vectors = torch.load(self.vectors_file, map_location=self.device) if self.unused_indices_file.is_file(): - self.unused_indices = torch.load(self.unused_indices_file, map_location='cpu') + self.unused_indices = torch.load( + self.unused_indices_file, map_location="cpu" + ) else: self.unused_indices = np.arange(len(self.gan_vectors)) else: - self.gan_vectors, self.unused_indices = self._generate_artificial_embeddings(gan_model_path, self.n) + ( + self.gan_vectors, + self.unused_indices, + ) = self._generate_artificial_embeddings(gan_model_path, self.n) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - if emb_level == 'spk': - print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') - elif emb_level == 'utt': - print(f'Anonymize embeddings of {len(speaker_embeddings)} utterances...') + def anonymize_embeddings( + self, speaker_embeddings: torch.Tensor, emb_level: str = "spk" + ): + """ + Anonymize speaker embeddings using the GAN model. + Args: + speaker_embeddings: [n_embeddings, n_channels] Speaker + embeddings to be anonymized. + emb_level: Embedding level ('spk' for speaker level + or 'utt' for utterance level). 
+ """ + if emb_level == "spk": + print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...") + elif emb_level == "utt": + print(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...") identifiers = [] speakers = [] @@ -51,16 +102,22 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - speakers=speakers, genders=genders) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, + genders=genders, + ) if self.save_intermediate: torch.save(self.unused_indices, self.unused_indices_file) return anon_embeddings - def _generate_artificial_embeddings(self, gan_model_path, n): - print(f'Generate {n} artificial speaker embeddings...') + def _generate_artificial_embeddings(self, gan_model_path: Path, n: int): + print(f"Generate {n} artificial speaker embeddings...") generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device) gan_vectors = generator.generate_embeddings(n=n) unused_indices = np.arange(len(gan_vectors)) @@ -70,7 +127,7 @@ def _generate_artificial_embeddings(self, gan_model_path, n): torch.save(unused_indices, self.unused_indices_file) return gan_vectors, unused_indices - def _select_gan_vector(self, spk_vec): + def _select_gan_vector(self, spk_vec: torch.Tensor): i = 0 limit = 20 while i < limit: diff --git a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py index 742071f..d21545d 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py +++ b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py @@ -1,26 +1,36 @@ from .base_anon import BaseAnonymizer import torch +from typing import Union +import ruamel.yaml as yaml -class Passthrough(BaseAnonymizer): - def __init__(self, vec_type='xvector', device=None, **kwargs): - # Base class for speaker embedding anonymization. - self.vec_type = vec_type - self.suffix = '_res' +class Passthrough(BaseAnonymizer): + """ + A 'Passthrough' 'anonymizer' that does not anonymize the speaker embeddings. + """ - if isinstance(device, torch.device): - self.device = device - elif isinstance(device, str): - self.device = torch.device(device) - elif isinstance(device, int): - self.device = torch.device(f'cuda:{device}') - else: - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + def __init__( + self, + vec_type: str = "", + device: Union[str, torch.device, int] = "cuda:0", + suffix: str = "_res", + **kwargs + ): + super().__init__(vec_type, device, suffix, **kwargs) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - # no need to refer to emb_level, + def anonymize_embeddings( + self, speaker_embeddings: torch.Tensor, emb_level: str = "spk" + ) -> torch.Tensor: + """ + Returns the speaker embeddings unchanged. + """ + # no need to refer to emb_level, # as extractor also yields spk-level or utt-level. return speaker_embeddings - + def to(self, device): - self.device = device + """ + Move the anonymizer to the given device. 
For the passthrough anonymizer, + this is a no-op, apart from setting the property. + """ + super().to(device) diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index 6ef578e..3a21db1 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -3,6 +3,9 @@ import torch import json from tqdm import tqdm +from typing import Union + +from os import PathLike from sklearn.metrics.pairwise import cosine_distances from sklearn.preprocessing import minmax_scale, StandardScaler @@ -12,42 +15,127 @@ from ..speaker_embeddings import SpeakerEmbeddings from utils import transform_path -REVERSED_GENDERS = {'m': 'f', 'f': 'm'} +REVERSED_GENDERS = { + "m": "f", + "f": "m" +} class PoolAnonymizer(BaseAnonymizer): + """ + An implementation of the 'Pool' anonymization method, that is based on the + primary baseline of the Voice Privacy Challenge 2020. + + For every source x-vector, an anonymized x-vector is computed by finding + the N farthest x-vectors in an external pool (LibriTTS train-other-500) + according to the PLDA distance, and by averaging N∗ randomly selected + vectors among them. In the baseline, we use: + N = 200, + N∗ = 100. + """ + def __init__( + self, + vec_type: str = "xvector", + device: Union[str, torch.device, int, None] = None, + model_name: str = None, + pool_data_dir: Union[str, PathLike] = "data/libritts_train_other_500", + pool_vec_path: Union[str, PathLike] = "original_speaker_embeddings/pool_embeddings", + N: int = 200, + N_star: int = 100, + distance: str = "plda", + cross_gender: bool = False, + proximity: bool = "farthest", + scaling: str = None, + stats_per_dim_path: Union[str, PathLike] = None, + distance_model_path: Union[str, PathLike] = "distances/plda/libritts_train_other_500_xvector", + embed_model_path: Union[str, PathLike] = None, + save_intermediate: bool = False, + suffix: str = "_anon", + **kwargs, + ): + """ + Args: + vec_type (str): Type of the speaker embeddings, currently supported + are 'xvector', 'ecapa', 'style-embed'. + + device (Union[str, torch.device, int, None]): Device to use for + the procedure, e.g. 'cpu', 'cuda', 'cuda:0', etc. + + model_name (str): Name of the model, used for distances that + require a model (e.g., PLDA). + + pool_data_dir (Union[str, PathLike]): Path to the audio data + which will be used for x-vector pool extraction. + + pool_vec_path (Union[str, PathLike]): Path to the stored + speaker embeddings of the pool. + + N (int): Number of most 'fitting' vectors to consider. + + N_star (int): Number of vectors to randomly select from the N most + 'fitting' vectors, to compute the average. + + distance (str): Distance measure, either 'plda' or 'cosine'. + + cross_gender (bool): Whether to switch genders of the speakers + during anonymization. + + proximity (str): Proximity measure, determining which vectors in + the pool are the 'fittest', can be either 'farthest', + 'nearest' or 'center'. 
- def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_dir='data/libritts_train_other_500', - pool_vec_path='original_speaker_embeddings/pool_embeddings', N=200, N_star=100, distance='plda', - cross_gender=False, proximity='farthest', scaling=None, stats_per_dim_path=None, - distance_model_path='distances/plda/libritts_train_other_500_xvector', - embed_model_dir=None, save_intermediate=False, **kwargs): - # Pool anonymization method based on the primary baseline of the Voice Privacy Challenge 2020. - # Given a speaker vector, the N most distant vectors in an external speaker pool are extracted, - # and an average of a random subset of N_star vectors is computed and taken as new speaker vector. - # Default distance measure is PLDA. - super().__init__(vec_type=vec_type, device=device) - - self.model_name = model_name if model_name else f'pool_{vec_type}' - - self.N = N # number of most distant vectors to consider - self.N_star = N_star # number of vectors to include in averaged vector - self.proximity = proximity # proximity method, either 'farthest' (distant vectors), 'nearest', or 'closest' - self.cross_gender = cross_gender # Whether to reverse the genders of the speakers + scaling (str): Scaling method to use, can be either 'minmax' or + 'std'. + + stats_per_dim_path (Union[str, PathLike]): Path to the file + containing the statistics for each dimension in the given + embedding type. + + distance_model_path (Union[str, PathLike]): Path to the stored + distance model (required for PLDA). + + embed_model_path (Union[str, PathLike]): Path to the directory + containing the speaker embedding model. + + save_intermediate (bool): Whether to save intermediate results. + + suffix (str): Suffix to append to the output folder names. 
+ + """ + print(locals()) + super().__init__(vec_type=vec_type, device=device, suffix=suffix) + + self.model_name = model_name if model_name else f"pool_{vec_type}" + + self.N = N + self.N_star = N_star + self.proximity = proximity + self.cross_gender = cross_gender self.save_intermediate = save_intermediate # external speaker pool - self.pool_embeddings = self._load_pool_embeddings(pool_data_dir=Path(pool_data_dir), - pool_vec_path=Path(pool_vec_path), - embed_model_dir=Path(embed_model_dir)) - self.pool_genders = {gender: [i for i, spk_gender in enumerate(self.pool_embeddings.genders) - if spk_gender == gender] for gender in set(self.pool_embeddings.genders)} + self.pool_embeddings = self._load_pool_embeddings( + pool_data_dir=Path(pool_data_dir).expanduser(), + pool_vec_path=Path(pool_vec_path).expanduser(), + embed_model_path=Path(embed_model_path).expanduser(), + ) + self.pool_genders = { + gender: [ + i + for i, spk_gender in enumerate(self.pool_embeddings.genders) + if spk_gender == gender + ] + for gender in set(self.pool_embeddings.genders) + } # distance model; PLDA model if distance == plda; None if distance == cosine self.distance = distance # distance measure, either 'plda' or 'cosine' - if self.distance == 'plda': - self.distance_model = PLDAModel(train_embeddings=self.pool_embeddings, - results_path=Path(distance_model_path), save_plda=self.save_intermediate) + if self.distance == "plda": + self.distance_model = PLDAModel( + train_embeddings=self.pool_embeddings, + results_path=Path(distance_model_path), + save_plda=self.save_intermediate, + ) else: self.distance_model = None @@ -55,23 +143,32 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_d self.scaling = scaling self.stats_per_dim_path = stats_per_dim_path or Path() - def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_dir): + def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_path): print(pool_data_dir) if pool_vec_path.exists(): - pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) + pool_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, emb_level="spk", device=self.device + ) pool_embeddings.load_vectors(pool_vec_path) else: - extraction_settings = {'vec_type': self.vec_type, 'emb_level': 'spk'} - emb_extractor = SpeakerExtraction(results_dir=pool_vec_path, model_dir=embed_model_dir, device=self.device, - settings=extraction_settings, save_intermediate=self.save_intermediate) - pool_embeddings = emb_extractor.extract_speakers(dataset_path=pool_data_dir, dataset_name='') + extraction_settings = {"vec_type": self.vec_type, "emb_level": "spk", "embed_model_path": embed_model_path} + emb_extractor = SpeakerExtraction( + results_dir=pool_vec_path, + devices=[self.device], + settings=extraction_settings, + save_intermediate=self.save_intermediate, + ) + pool_embeddings = emb_extractor.extract_speakers( + dataset_path=pool_data_dir, dataset_name="" + ) return pool_embeddings - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - distance_matrix = self._compute_distances(vectors_a=self.pool_embeddings.vectors, - vectors_b=speaker_embeddings.vectors) + def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk"): + distance_matrix = self._compute_distances( + vectors_a=self.pool_embeddings.vectors, vectors_b=speaker_embeddings.vectors + ) - print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') + print(f"Anonymize embeddings of 
{len(speaker_embeddings)} speakers...") identifiers = [] speakers = [] anon_vectors = [] @@ -83,23 +180,37 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): gender = speaker_embeddings.genders[i] distances_to_speaker = distance_matrix[:, i] candidates = self._get_pool_candidates(distances_to_speaker, gender) - selected_anon_pool = np.random.choice(candidates, self.N_star, replace=False) - anon_vec = torch.mean(self.pool_embeddings.speaker_vectors[selected_anon_pool], dim=0) + selected_anon_pool = np.random.choice( + candidates, self.N_star, replace=False + ) + anon_vec = torch.mean( + self.pool_embeddings.speaker_vectors[selected_anon_pool], dim=0 + ) identifiers.append(identifier) speakers.append(speaker) anon_vectors.append(anon_vec) - genders.append(gender if not self.cross_gender else REVERSED_GENDERS[gender]) + genders.append( + gender if not self.cross_gender else REVERSED_GENDERS[gender] + ) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - speakers=speakers, genders=genders) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, + genders=genders, + ) return anon_embeddings def _compute_distances(self, vectors_a, vectors_b): - if self.distance == 'plda': - return 1 - self.distance_model.compute_distance(enrollment_vectors=vectors_a, trial_vectors=vectors_b) - elif self.distance == 'cosine': + if self.distance == "plda": + return 1 - self.distance_model.compute_distance( + enrollment_vectors=vectors_a, trial_vectors=vectors_b + ) + elif self.distance == "cosine": return cosine_distances(X=vectors_a.cpu(), Y=vectors_b.cpu()) else: return [] @@ -110,45 +221,49 @@ def _get_pool_candidates(self, distances, gender): else: distances = distances[self.pool_genders[gender]] - if self.proximity == 'farthest': - return np.argpartition(distances, -self.N)[-self.N:] - elif self.proximity == 'nearest': - return np.argpartition(distances, self.N)[:self.N] - elif self.proximity == 'center': + if self.proximity == "farthest": + return np.argpartition(distances, -self.N)[-self.N :] + elif self.proximity == "nearest": + return np.argpartition(distances, self.N)[: self.N] + elif self.proximity == "center": sorted_distances = np.sort(distances) - return sorted_distances[len(sorted_distances)//2:(len(sorted_distances)//2)+self.N] + return sorted_distances[ + len(sorted_distances) // 2 : (len(sorted_distances) // 2) + self.N + ] def _load_scaling_ranges(self, stats_per_dim_path): if stats_per_dim_path and Path(stats_per_dim_path).exists(): with open(stats_per_dim_path) as f: dim_ranges = json.load(f) - return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + return [ + (v["min"], v["max"]) + for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0])) + ] else: - raise FileNotFoundError(f'You need to specify a path to an existing file containing the statistics for ' - f'each dimension in the given embedding type, ' - f'stats_per_dim_path={stats_per_dim_path} is not valid!') + raise FileNotFoundError( + f"You need to specify a path to an existing file containing the statistics for " + f"each dimension in the given embedding type, " + f"stats_per_dim_path={stats_per_dim_path} is not valid!" 
+ ) def _scale_embeddings(self, embeddings): vectors = embeddings.vectors.cpu().numpy() - if self.scaling == 'minmax': + if self.scaling == "minmax": scaling_ranges = self._load_scaling_ranges(self.stats_per_dim_path) scaled_dims = [] for i in range(len(scaling_ranges)): - scaled_dims.append(minmax_scale(vectors[:, i], scaling_ranges[i], axis=0)) + scaled_dims.append( + minmax_scale(vectors[:, i], scaling_ranges[i], axis=0) + ) scaled_vectors = torch.tensor(np.array(scaled_dims)).T.to(self.device) embeddings.vectors = scaled_vectors - elif self.scaling == 'std': + elif self.scaling == "std": std_scaler = StandardScaler() std_scaler.fit(self.pool_embeddings.vectors.cpu().numpy()) scaled_vectors = torch.tensor(std_scaler.transform(vectors)) embeddings.vectors = scaled_vectors return embeddings - -# for every source x-vector, an anonymized x-vector is computed by finding the N farthest x- -# vectors in an external pool (LibriTTS train-other-500) accord- -# ing to the PLDA distance, and by averaging N ∗ randomly se- -# lected vectors among them. In the baseline, we use N = 200 and N ∗ = 100 \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py index 764e60d..98ffc84 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py @@ -1,6 +1,8 @@ import json from pathlib import Path import torch +from os import PathLike +from typing import Union import numpy as np from .base_anon import BaseAnonymizer @@ -8,13 +10,42 @@ class RandomAnonymizer(BaseAnonymizer): - - def __init__(self, vec_type='xvector', device=None, model_name=None, in_scale=False, stats_per_dim_path=None, - **kwargs): + """ + An anonymizer module that generates random vectors for each speaker or + utterance. The vectors are generated by sampling from a uniform + distribution for each dimension. The range of the uniform distribution + is determined by the minimum and maximum values of the original + speaker/utterance vectors. + """ + def __init__( + self, + device: Union[str, torch.device, int, None], + vec_type: str = "xvector", + model_name: str = None, + in_scale: bool = False, + stats_per_dim_path: Union[str, PathLike] =None, + **kwargs, + ): + """ + Args: + device: The computation device to use for the anonymization. + vec_type: The type of the speaker embedding to anonymize. Valid + values are 'xvector', 'style-embed', 'ecapa' + model_name: The name of the model used for the anonymization. + Defaults to 'random_{vec_type}'. + in_scale: If True, the anonymized vectors will be in the same + scale as the original vectors. Otherwise, the vectors will be + sampled from a uniform distribution with the same range for + each dimension. + stats_per_dim_path: The path to the json file containing the + minimum and maximum values for each dimension of the original + vectors. If None, the stats will be loaded from the file + 'stats_per_dim.json'. 
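Put together, `_compute_distances`, `_get_pool_candidates`, and the averaging in `anonymize_embeddings` above implement the VPC 2020 primary-baseline recipe: take the N farthest pool vectors, then average a random subset of N* of them. A compact numpy sketch with toy sizes, using cosine distance instead of PLDA and omitting the gender filtering that the class performs:

    # Compact sketch of PoolAnonymizer's selection/averaging (toy sizes; cosine
    # distance instead of PLDA, gender filtering omitted).
    import numpy as np

    def pool_anonymize(src, pool, N=200, N_star=100, rng=np.random.default_rng()):
        dists = 1 - pool @ src / (np.linalg.norm(pool, axis=1) * np.linalg.norm(src))
        farthest = np.argpartition(dists, -N)[-N:]            # indices of the N farthest
        subset = rng.choice(farthest, N_star, replace=False)  # random N* of those
        return pool[subset].mean(axis=0)                      # average -> anonymized vector

    pool = np.random.randn(500, 192)  # stand-in x-vector pool
    anon_vec = pool_anonymize(np.random.randn(192), pool)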
+ """ super().__init__(vec_type=vec_type, device=device) - self.model_name = model_name if model_name else f'random_{vec_type}' - + self.model_name = model_name if model_name else f"random_{vec_type}" + if in_scale: self.stats_per_dim_path = stats_per_dim_path else: @@ -26,13 +57,23 @@ def scaling_ranges(self): # defer loading of stats until they are first needed # required after anonymizer initialization is delegated to HyperPyYAML if self.stats_per_dim_path is not None: - self._scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=self.stats_per_dim_path) + self._scaling_ranges = self._load_scaling_ranges( + stats_per_dim_path=self.stats_per_dim_path + ) self.stats_per_dim_path = None return self._scaling_ranges - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + def anonymize_embeddings(self, speaker_embeddings, emb_level="spk"): + """ + Anonymize speaker embeddings using random vectors. + Args: + speaker_embeddings: [n_embeddings, n_channels] Speaker + embeddings to be anonymized. + emb_level: Embedding level ('spk' for speaker level or 'utt' for + utterance level). + """ if self.scaling_ranges: - print('Anonymize vectors in scale!') + print("Anonymize vectors in scale!") return self._anonymize_data_in_scale(speaker_embeddings) else: identifiers = [] @@ -40,24 +81,38 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): speakers = speaker_embeddings.original_speakers genders = speaker_embeddings.genders for identifier, vector in speaker_embeddings: - mask = torch.zeros(vector.shape[0]).float().random_(-40, 40).to(self.device) + mask = ( + torch.zeros(vector.shape[0]) + .float() + .random_(-40, 40) + .to(self.device) + ) anon_vec = vector * mask identifiers.append(identifier) anon_vectors.append(anon_vec) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - genders=genders, speakers=speakers) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + genders=genders, + speakers=speakers, + ) return anon_embeddings def _load_scaling_ranges(self, stats_per_dim_path): if stats_per_dim_path is None: - stats_per_dim_path = Path('stats_per_dim.json') + stats_per_dim_path = Path("stats_per_dim.json") with open(stats_per_dim_path) as f: dim_ranges = json.load(f) - return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + return [ + (v["min"], v["max"]) + for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0])) + ] def _anonymize_data_in_scale(self, speaker_embeddings): identifiers = [] @@ -66,13 +121,18 @@ def _anonymize_data_in_scale(self, speaker_embeddings): genders = speaker_embeddings.genders for identifier, vector in speaker_embeddings: - anon_vec = torch.tensor([np.random.uniform(*dim_range) - for dim_range in self.scaling_ranges]).to(self.device) + anon_vec = torch.tensor( + [np.random.uniform(*dim_range) for dim_range in self.scaling_ranges] + ).to(self.device) identifiers.append(identifier) anon_vectors.append(anon_vec) anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), genders=genders, - speakers=speakers) + anon_embeddings.set_vectors( + identifiers=identifiers, + 
vectors=torch.stack(anon_vectors, dim=0), + genders=genders, + speakers=speakers, + ) - return anon_embeddings \ No newline at end of file + return anon_embeddings diff --git a/configs/anon/ims_gan.yaml b/configs/anon/ims_gan.yaml index 8e19edc..270baaa 100644 --- a/configs/anon/ims_gan.yaml +++ b/configs/anon/ims_gan.yaml @@ -1,9 +1,8 @@ !new:anonymization.modules.speaker_embeddings.anonymization.gan_anon.GANAnonymizer - models_dir: null # overridden by main .yaml - vec_type: null # overriden by main .yaml - save_intermediate: null # overriden by main .yaml - model_name: null # overriden by constructor - vectors_file: !ref /anonymization/style-embed_wgan.pt + models_dir: !PLACEHOLDER # overridden by main .yaml + vec_type: !PLACEHOLDER # overriden by main .yaml + save_intermediate: !PLACEHOLDER # overriden by main .yaml + vectors_file: !ref /anonymization/_wgan.pt gan_model_path: !ref /anonymization/gan_/_wgan.pt num_sampled: 5000 sim_threshold: 0.7 \ No newline at end of file diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml index ccdfd8d..5bad248 100644 --- a/configs/anon/pool.yaml +++ b/configs/anon/pool.yaml @@ -1,11 +1,13 @@ -# pool_anon_settings are only used if anon_method == pool -pool_data_dir: !ref /libritts_train_other_500 -pool_vec_path: !ref /style-embed_spk-level/pool_embeddings -N: 200 -N_star: 100 -distance: plda # possible: plda, cosine -plda_dir: !ref /distances/plda/libritts_train_other_500_xvector -cross_gender: false -proximity: farthest # possible: farthest, nearest, center -scaling: maxmin # possible: none, maxmin, mean -stats_per_dim_path: !ref /anonymization/stats_per_dim.json \ No newline at end of file +!new:anonymization.modules.speaker_embeddings.anonymization.pool_anon.PoolAnonymizer + data_dir: !PLACEHOLDER # to be overridden by the main config + embed_model_path: !PLACEHOLDER # to be overridden by the main config + pool_data_dir: !ref /libritts_train_other_500 + pool_vec_path: !ref /style-embed_spk-level/pool_embeddings + N: 200 + N_star: 100 + distance: plda # possible: plda, cosine + plda_dir: !ref /distances/plda/libritts_train_other_500_xvector + cross_gender: false + proximity: farthest # possible: farthest, nearest, center + scaling: maxmin # possible: none, maxmin, mean + stats_per_dim_path: !ref /anonymization/stats_per_dim.json \ No newline at end of file diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index ff9effe..87760af 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -1,10 +1,10 @@ root_dir : .. -data_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /data +data_dir: !PLACEHOLDER save_intermediate: true save_output: true -results_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /results -models_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /models +results_dir: !PLACEHOLDER # TODO adjust path, e.g. /results +models_dir: !PLACEHOLDER # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false diff --git a/configs/eval_gvd_both.yaml b/configs/eval_gvd_both.yaml index 10735bf..c0b8b91 100644 --- a/configs/eval_gvd_both.yaml +++ b/configs/eval_gvd_both.yaml @@ -38,7 +38,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
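The switch from the custom `Todo` class to HyperPyYAML's built-in `!PLACEHOLDER` in the configs above keeps the fail-fast behavior without any repository code: loading aborts unless the key is overridden, and `!ref` entries are interpolated from other keys. A small illustration (the key names are made up; the angle-bracket reference syntax is HyperPyYAML's):

    # Illustration of !PLACEHOLDER and !ref at load time (illustrative keys).
    from hyperpyyaml import load_hyperpyyaml

    yaml_string = """
    models_dir: !PLACEHOLDER
    vectors_file: !ref <models_dir>/anonymization/xvector_wgan.pt
    """

    # Without an override for models_dir, load_hyperpyyaml raises; with one, the
    # !ref entry is resolved against the supplied value.
    config = load_hyperpyyaml(yaml_string, overrides={"models_dir": "models"})
    print(config["vectors_file"])  # -> models/anonymization/xvector_wgan.pt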
+eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: diff --git a/configs/eval_post_ecapa_cos_ft.yaml b/configs/eval_post_ecapa_cos_ft.yaml index 9e06d69..610d80a 100644 --- a/configs/eval_post_ecapa_cos_ft.yaml +++ b/configs/eval_post_ecapa_cos_ft.yaml @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_ft # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_post_ecapa_cos_scratch.yaml b/configs/eval_post_ecapa_cos_scratch.yaml index cc9ddae..05aa510 100644 --- a/configs/eval_post_ecapa_cos_scratch.yaml +++ b/configs/eval_post_ecapa_cos_scratch.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_scratch # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_post_xvector_plda_scratch.yaml b/configs/eval_post_xvector_plda_scratch.yaml index e46d9bd..9326102 100644 --- a/configs/eval_post_xvector_plda_scratch.yaml +++ b/configs/eval_post_xvector_plda_scratch.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_scratch # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index ca444ab..f81a568 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -11,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !new:utils.config_primitives.Todo # TODO path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
+eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -42,7 +42,7 @@ privacy: utility: asr: - libri_dir: !new:utils.config_primitives.Todo # TODO path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_plda.yaml b/configs/eval_pre_ecapa_plda.yaml index a9cf2a6..f27d44a 100644 --- a/configs/eval_pre_ecapa_plda.yaml +++ b/configs/eval_pre_ecapa_plda.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_cos.yaml b/configs/eval_pre_xvector_cos.yaml index 4c3c7d2..1c9fc4f 100644 --- a/configs/eval_pre_xvector_cos.yaml +++ b/configs/eval_pre_xvector_cos.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_plda.yaml b/configs/eval_pre_xvector_plda.yaml index 68986fc..c7e3d04 100644 --- a/configs/eval_pre_xvector_plda.yaml +++ b/configs/eval_pre_xvector_plda.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. 
+ libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/utils/config_primitives.py b/utils/config_primitives.py deleted file mode 100644 index dc715ed..0000000 --- a/utils/config_primitives.py +++ /dev/null @@ -1,10 +0,0 @@ -class IncompleteConfigException(Exception): - pass - - -class Todo: - def __init__(self, *args, **kwargs): - raise IncompleteConfigException( - 'You must complete the config files before execution. ' - 'Find lines with "TODO" and enter the appropriate information.' - ) From 3e902231914ad7fa61f1c707618f6a7d9e1645b5 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:29 +0100 Subject: [PATCH 14/33] Makefile now downloads and extracts the pretrained models for evaluation too --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 714666d..fa31058 100644 --- a/Makefile +++ b/Makefile @@ -34,14 +34,18 @@ pretrained_models: ## downloads the pretrained models from IMS repositories @wget -q -O models/anonymization.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/anonymization.zip @wget -q -O models/asr.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/asr.zip @wget -q -O models/tts.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/tts.zip + @wget -q -O models/pre_eval_models.zip https://github.com/DigitalPhonetics/VoicePAT/releases/download/v1/pre_eval_models.zip @unzip -oq models/asr.zip -d models @unzip -oq models/tts.zip -d models @unzip -oq models/anonymization.zip -d models + @unzip -oq models/pre_eval_models.zip -d models + @mkdir evaluation/utility/asr/exp + @ln -srf evaluation/utility/asr/exp exp @rm models/*.zip $(ENV_NAME): environment.yaml - @($(CONDA) env create -f $< -p ./$@ && @echo Installation complete, please run `conda-develop .` once.) || $(CONDA) env update -f $< -p ./$@ + @($(CONDA) env create -f $< -p ./$@ && echo Installation complete, please run `conda develop .` once.) 
|| $(CONDA) env update -f $< -p ./$@ @conda config --set env_prompt '($$(basename {default_env})) ' @(cat .gitignore | grep -q $(ENV_NAME)) || echo $(ENV_NAME) >> .gitignore From 77fd42f2fbd7a2117fb6ff3c23dbbfcb2fbdf103 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:43 +0100 Subject: [PATCH 15/33] Updates to the environment --- environment.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/environment.yaml b/environment.yaml index 058b844..7d9a926 100644 --- a/environment.yaml +++ b/environment.yaml @@ -30,18 +30,20 @@ dependencies: - pyyaml - click - typer + - typer-cli - tqdm - openpyxl - librosa - resampy - python-sounddevice - - pytorch::pytorch-cuda + - pytorch::pytorch-cuda - pytorch::ignite - pytorch::torchaudio + - cudatoolkit-dev - tensorboardx - tensorboard - optuna - hydra-core - typeguard==2.13.3 - conda-build - - torchvision # only to prevent warnings \ No newline at end of file + - torchvision # only to prevent warnings From 714f1e6797944bc62d01b3ae1fde4390409137ed Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 15:50:15 +0100 Subject: [PATCH 16/33] Minor fixes to run_evaluation.sh - Use absolute paths for `asr.sh` invocation - Improved documentation --- run_evaluation.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/run_evaluation.py b/run_evaluation.py index 65213bc..d81ab07 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser from pathlib import Path import pandas as pd +from typing import List parser = ArgumentParser() parser.add_argument('--config', default='config_eval.yaml') @@ -210,15 +211,16 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): print(f'{trial} gvd={gvd_value}') -def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): +def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, anon_data_suffix: str): print(f'Train ASR model: {model_dir}') exp_dir = Path('exp', model_name) + libri_dir = Path(libri_dir).expanduser() # could be relative to userdir ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available train_params = [ '--lang', 'en', '--ngpu', str(ngpu), - '--expdir', str(exp_dir), + '--expdir', str(exp_dir.absolute()), '--use_lm', 'false', '--nbpe', '5000', '--num_utt', str(params['num_utt']), @@ -233,11 +235,11 @@ def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): asr_config = 'conf/train_asr_transformer.yaml' if params.get('anon', False): - local_data_opts = ' '.join([str(libri_dir), str(params['train_data_dir']), anon_data_suffix]) + local_data_opts = ' '.join([str(libri_dir.absolute()), str(params['train_data_dir'].absolute()), anon_data_suffix]) train_set = f'train_clean_360_{anon_data_suffix}' if params.get('finetuning', False) is True: asr_config = 'conf/train_asr_transformer_anon.yaml' - train_params.extend(['--pretrained_model', f'{str(params["pretrained_model"])}/valid.acc.ave.pth']) + train_params.extend(['--pretrained_model', f'{str(params["pretrained_model"].absolute())}/valid.acc.ave.pth']) else: local_data_opts = str(libri_dir) train_set = 'train_clean_360' @@ -255,20 +257,20 @@ def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): os.chdir(cwd) -def asr_eval_sh(eval_datasets, eval_data_dir, params, model_path, libri_dir, anon_data_suffix): +def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, 
params, model_path, libri_dir, anon_data_suffix): print(f'Use ASR model for evaluation: {model_path}') test_sets = [] for asr_dataset in eval_datasets: anon_asr_dataset = f'{asr_dataset}_{anon_data_suffix}' - test_sets.append(str(eval_data_dir / asr_dataset)) - test_sets.append(str(eval_data_dir / anon_asr_dataset)) + test_sets.append(str((eval_data_dir / asr_dataset).absolute())) + test_sets.append(str((eval_data_dir / anon_asr_dataset).absolute())) ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available inference_params = [ '--ngpu', str(ngpu), - '--expdir', str(model_path), + '--expdir', str(model_path.absolute()), '--asr_exp', str(model_path), '--use_lm', 'true', '--local_data_opts', str(libri_dir), @@ -299,6 +301,9 @@ def asr_eval_sh(eval_datasets, eval_data_dir, params, model_path, libri_dir, ano eval_data_dir = params['eval_data_dir'] anon_suffix = params['anon_data_suffix'] + # make sure given paths exist + assert eval_data_dir.exists(), f'{eval_data_dir} does not exist' + if 'privacy' in eval_steps: if 'asv' in eval_steps['privacy']: asv_params = params['privacy']['asv'] From 8a6dd3db29f8159f22dd5d82c6a6d67faf020652 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:30:18 +0100 Subject: [PATCH 17/33] Use espnet python package for ASR eval --- evaluation/utility/asr/path.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evaluation/utility/asr/path.sh b/evaluation/utility/asr/path.sh index 01d36dd..77c850a 100755 --- a/evaluation/utility/asr/path.sh +++ b/evaluation/utility/asr/path.sh @@ -1,4 +1,4 @@ -MAIN_ROOT=~/espnet/ # TODO: change this to the path to your ESPnet installation +MAIN_ROOT=./voicepat_env/lib/python3.11/site-packages/espnet # TODO: change this to the path to your ESPnet installation export PATH=$PWD/utils/:$PATH export LC_ALL=C @@ -8,7 +8,12 @@ if [ -f "${MAIN_ROOT}"/tools/activate_python.sh ]; then else echo "[INFO] "${MAIN_ROOT}"/tools/activate_python.sh is not present" fi + +if [ -f "${MAIN_ROOT}"/tools/extra_path.sh ]; then . 
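The common thread in the `run_evaluation.py` changes of patch 16 above is resolving paths to absolute before the script `os.chdir`s into the espnet recipe directory, because relative paths silently re-anchor once the working directory changes. A minimal demonstration, assuming the repository root as the starting working directory:

    # Why patch 16 pins paths down with .absolute(): a relative Path re-anchors
    # after os.chdir, an absolute one does not.
    import os
    from pathlib import Path

    exp_dir = Path("exp/asr_pre")       # relative to the project root
    pinned = exp_dir.absolute()         # resolve while still in the root

    os.chdir("evaluation/utility/asr")  # enter the espnet recipe directory
    print(exp_dir.resolve())            # now resolves inside the recipe dir
    print(pinned)                       # still points at the original location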
"${MAIN_ROOT}"/tools/extra_path.sh +else + echo "[INFO] "${MAIN_ROOT}"/tools/extra_path.sh is not present" +fi export OMP_NUM_THREADS=1 From dd159e1368790d78b7146319310f8914f29f4b76 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:34:57 +0100 Subject: [PATCH 18/33] Use tqdm to display synthesis progress --- anonymization/modules/tts/speech_synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anonymization/modules/tts/speech_synthesis.py b/anonymization/modules/tts/speech_synthesis.py index 4ca3ebc..911e68c 100644 --- a/anonymization/modules/tts/speech_synthesis.py +++ b/anonymization/modules/tts/speech_synthesis.py @@ -119,7 +119,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non with Pool(processes=num_processes) as pool: job_params = zip(instances, self.tts_models, repeat(dataset_results_dir), sleeps, repeat(text_is_phones), repeat(self.save_output)) - new_wavs = pool.starmap(synthesis_job, job_params) + new_wavs = pool.starmap(tqdm(synthesis_job), job_params) for new_wav_dict in new_wavs: wavs.update(new_wav_dict) From 035ad5a2b855a7271d2454592dd72261ceeb8b9f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:48:09 +0100 Subject: [PATCH 19/33] Improved documentation for `stts_pipeline.py` --- anonymization/pipelines/sttts_pipeline.py | 182 +++++++++++++++------- 1 file changed, 123 insertions(+), 59 deletions(-) diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index 53c542a..10f5a1b 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -1,105 +1,169 @@ from pathlib import Path from datetime import datetime -from anonymization.modules import SpeechRecognition, SpeechSynthesis, ProsodyExtraction, ProsodyAnonymization, SpeakerExtraction, \ - SpeakerAnonymization +from anonymization.modules import ( + SpeechRecognition, + SpeechSynthesis, + ProsodyExtraction, + ProsodyAnonymization, + SpeakerExtraction, + SpeakerAnonymization, +) +import typing from utils import prepare_evaluation_data, save_yaml class STTTSPipeline: - """ - This pipeline consists of: - - ASR -> phone sequence - - input - (prosody extractor -> prosody anonymizer) - TTS -> output - - speaker embedding extractor -> speaker anonymizer - - """ - - def __init__(self, config, force_compute_all, devices): + def __init__(self, config: dict, force_compute: bool, devices: list): + """ + Instantiates a STTTSPipeline with the complete feature extraction, + modification and resynthesis. + + This pipeline consists of: + - ASR -> phone sequence - + input - (prosody extr. -> prosody anon.) - TTS -> output + - speaker embedding extr. -> speaker anon. - + + Args: + config (dict): a configuration dictionary, e.g., see anon_ims_sttts_pc.yaml + force_compute (bool): if True, forces re-computation of + all steps. otherwise uses saved results. 
+ devices (list): a list of torch-interpretable devices + """ self.config = config - model_dir = Path(config.get('models_dir', 'models')) - vectors_dir = Path(config.get('vectors_dir', 'original_speaker_embeddings')) - self.results_dir = Path(config.get('results_dir', 'results')) - self.data_dir = Path(config['data_dir']) if 'data_dir' in config else None - save_intermediate = config.get('save_intermediate', True) + model_dir = Path(config.get("models_dir", "models")) + vectors_dir = Path(config.get("vectors_dir", "original_speaker_embeddings")) + self.results_dir = Path(config.get("results_dir", "results")) + self.data_dir = Path(config["data_dir"]) if "data_dir" in config else None + save_intermediate = config.get("save_intermediate", True) - - modules_config = config['modules'] + modules_config = config["modules"] # ASR component - self.speech_recognition = SpeechRecognition(devices=devices, save_intermediate=save_intermediate, - settings=modules_config['asr'], force_compute=force_compute_all) + self.speech_recognition = SpeechRecognition( + devices=devices, + save_intermediate=save_intermediate, + settings=modules_config["asr"], + force_compute=force_compute, + ) # Speaker component - self.speaker_extraction = SpeakerExtraction(devices=devices, - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute, - ) - self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute) + self.speaker_extraction = SpeakerExtraction( + devices=devices, + save_intermediate=save_intermediate, + settings=modules_config["speaker_embeddings"], + force_compute=force_compute, + ) + self.speaker_anonymization = SpeakerAnonymization( + vectors_dir=vectors_dir, + device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config["speaker_embeddings"], + force_compute=force_compute, + ) # Prosody component - if 'prosody' in modules_config: - self.prosody_extraction = ProsodyExtraction(device=devices[0], save_intermediate=save_intermediate, - settings=modules_config['prosody'], - force_compute=force_compute) - if 'anonymizer' in modules_config['prosody']: - self.prosody_anonymization = ProsodyAnonymization(save_intermediate=save_intermediate, - settings=modules_config['prosody'], - force_compute=force_compute) + if "prosody" in modules_config: + self.prosody_extraction = ProsodyExtraction( + device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config["prosody"], + force_compute=force_compute, + ) + if "anonymizer" in modules_config["prosody"]: + self.prosody_anonymization = ProsodyAnonymization( + save_intermediate=save_intermediate, + settings=modules_config["prosody"], + force_compute=force_compute, + ) else: self.prosody_anonymization = None else: self.prosody_extraction = None # TTS component - self.speech_synthesis = SpeechSynthesis(devices=[devices[0]], settings=modules_config['tts'], - model_dir=model_dir, save_output=config.get('save_output', True), - force_compute=force_compute, - ) - - def run_anonymization_pipeline(self, datasets, prepare_results=True): + self.speech_synthesis = SpeechSynthesis( + devices=devices, + settings=modules_config["tts"], + model_dir=model_dir, + save_output=config.get("save_output", True), + force_compute=force_compute, + ) + + def run_anonymization_pipeline( + self, + datasets: typing.Dict[str, Path], + prepare_results: bool = 
True,
+    ):
+        """
+        Runs the anonymization algorithm on the given datasets. Optionally
+        prepares the results such that the evaluation pipeline
+        can interpret them.
+
+        Args:
+            datasets (dict of str -> Path): the datasets on which the
+                anonymization pipeline should be run. These datasets
+                will be processed sequentially.
+            prepare_results (bool): if True, the resulting anonymized
+                .wav files are prepared for evaluation
+        """
         anon_wav_scps = {}

         for i, (dataset_name, dataset_path) in enumerate(datasets.items()):
-            print(f'{i + 1}/{len(datasets)}: Processing {dataset_name}...')
+            print(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...")
             # Step 1: Recognize speech, extract speaker embeddings, extract prosody
-            texts = self.speech_recognition.recognize_speech(dataset_path=dataset_path, dataset_name=dataset_name)
-            spk_embeddings = self.speaker_extraction.extract_speakers(dataset_path=dataset_path,
-                                                                      dataset_name=dataset_name)
+            texts = self.speech_recognition.recognize_speech(
+                dataset_path=dataset_path, dataset_name=dataset_name
+            )
+            spk_embeddings = self.speaker_extraction.extract_speakers(
+                dataset_path=dataset_path, dataset_name=dataset_name
+            )

             if self.prosody_extraction:
-                prosody = self.prosody_extraction.extract_prosody(dataset_path=dataset_path, dataset_name=dataset_name,
-                                                                  texts=texts)
+                prosody = self.prosody_extraction.extract_prosody(
+                    dataset_path=dataset_path, dataset_name=dataset_name, texts=texts
+                )
             else:
                 prosody = None

             # Step 2: Anonymize speaker, change prosody
-            anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings,dataset_name=dataset_name)
+            anon_embeddings = self.speaker_anonymization.anonymize_embeddings(
+                speaker_embeddings=spk_embeddings, dataset_name=dataset_name
+            )

             if self.prosody_anonymization:
-                anon_prosody = self.prosody_anonymization.anonymize_prosody(prosody=prosody)
+                anon_prosody = self.prosody_anonymization.anonymize_prosody(
+                    prosody=prosody
+                )
             else:
                 anon_prosody = prosody

             # Step 3: Synthesize
-            wav_scp = self.speech_synthesis.synthesize_speech(dataset_name=dataset_name, texts=texts,
-                                                              speaker_embeddings=anon_embeddings,
-                                                              prosody=anon_prosody, emb_level=anon_embeddings.emb_level)
+            wav_scp = self.speech_synthesis.synthesize_speech(
+                dataset_name=dataset_name,
+                texts=texts,
+                speaker_embeddings=anon_embeddings,
+                prosody=anon_prosody,
+                emb_level=anon_embeddings.emb_level,
+            )
             anon_wav_scps[dataset_name] = wav_scp
-            print('Done')
+            print("Done")

         if prepare_results:
             if self.speaker_anonymization:
                 anon_vectors_path = self.speaker_anonymization.results_dir
             else:
                 anon_vectors_path = self.speaker_extraction.results_dir
-            now = datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M')
-            prepare_evaluation_data(dataset_dict=datasets, anon_wav_scps=anon_wav_scps,
-                                    anon_vectors_path=anon_vectors_path, anon_suffix=self.speaker_anonymization.suffix,
-                                    output_path=self.results_dir / 'formatted_data' / now)
-            save_yaml(self.config, self.results_dir / 'formatted_data' / now / 'config.yaml')
+            now = datetime.strftime(datetime.today(), "%d-%m-%y_%H:%M")
+            prepare_evaluation_data(
+                dataset_dict=datasets,
+                anon_wav_scps=anon_wav_scps,
+                anon_vectors_path=anon_vectors_path,
+                anon_suffix=self.speaker_anonymization.suffix,
+                output_path=self.results_dir / "formatted_data" / now,
+            )
+            save_yaml(
+                self.config, self.results_dir / "formatted_data" / now / "config.yaml"
+            )

         return anon_wav_scps
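For orientation, the documented interface reduces to a small driver. The following is a minimal usage sketch, not a shipped script: it assumes the config is parsed with HyperPyYAML (as the !ref/!include tags in the configs suggest), and the dataset path and override values are placeholders.

    from pathlib import Path
    import torch
    from hyperpyyaml import load_hyperpyyaml
    from anonymization.pipelines.sttts_pipeline import STTTSPipeline

    with open('configs/anon_ims_sttts_pc.yaml') as f:
        # !PLACEHOLDER entries must be overridden; the values here are illustrative
        config = load_hyperpyyaml(f, overrides={'data_dir': 'data', 'models_dir': 'models'})

    devices = [torch.device('cuda:0')] if torch.cuda.is_available() else [torch.device('cpu')]
    pipeline = STTTSPipeline(config=config, force_compute=False, devices=devices)

    # dataset name -> Kaldi-format directory (wav.scp, utt2spk, spk2gender, ...)
    datasets = {'libri_dev': Path('data/libri_dev')}
    anon_wav_scps = pipeline.run_anonymization_pipeline(datasets, prepare_results=True)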
From f8a36fa3c88367be5f933f9cfd03022401868e72 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:03:22 +0100
Subject: [PATCH 20/33] Add conda recipe for SCTK, required by the pip-installed ESPnet

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index fa31058..2717fe6 100644
--- a/Makefile
+++ b/Makefile
@@ -17,10 +17,10 @@ endif
 ##@ INSTALLATION
 ###############################

-install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment
-
-espnet: ## installs ESPNet
-	echo Deactivated
+install: $(ENV_NAME) ## performs the installation: the conda environment plus a locally built SCTK conda package
+	@git clone https://github.com/egaznep/sctk
+	@conda build sctk
+	@conda install --use-local sctk

 uninstall:
 	@rm -rf $(ENV_NAME)

From 76928c57dfbae926a5ad5a91c5110c34d89c8c71 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:03:37 +0100
Subject: [PATCH 21/33] Fix missing entry in `pool.yaml`

---
 configs/anon/pool.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml
index 5bad248..b5857d5 100644
--- a/configs/anon/pool.yaml
+++ b/configs/anon/pool.yaml
@@ -1,6 +1,7 @@
 !new:anonymization.modules.speaker_embeddings.anonymization.pool_anon.PoolAnonymizer
     data_dir: !PLACEHOLDER # to be overridden by the main config
     embed_model_path: !PLACEHOLDER # to be overridden by the main config
+    vectors_dir: !PLACEHOLDER # to be overridden by the main config
    pool_data_dir: !ref <data_dir>/libritts_train_other_500
    pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
    N: 200

From a7439c598ecad996d4b843855425d83d5a4056ad Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:25:41 +0100
Subject: [PATCH 22/33] Simplify model creation for `SpeakerExtraction` and `SpeechRecognition`

---
 .../modules/speaker_embeddings/speaker_extraction.py | 5 +----
 anonymization/modules/text/speech_recognition.py     | 7 ++-----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 5ce6cfb..38faa49 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -44,10 +44,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode
             'model_path': self.embed_model_path,
         }

-        if self.n_processes > 1:
-            self.extractors = None
-        else:
-            self.extractors = create_extractors(hparams=self.model_hparams, device=self.devices[0])
+        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]

     def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         dataset_name = dataset_name if dataset_name is not None else dataset_path.name
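The comprehension above pre-builds one extractor per worker slot by pairing an endlessly repeating device sequence with a bounded range. A standalone sketch of the idiom, with dummy stand-ins for the real devices:

    from itertools import cycle

    devices = ['cuda:0', 'cuda:1']
    n_processes = 2  # worker slots per device

    # cycle() repeats the device list forever; zip() stops after
    # n_processes * len(devices) pairs, spreading slots round-robin.
    slots = [device for device, _ in zip(cycle(devices), range(n_processes * len(devices)))]
    print(slots)  # ['cuda:0', 'cuda:1', 'cuda:0', 'cuda:1']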
diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index bf4dbf6..1a14f45 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -33,11 +33,8 @@ def __init__(self, devices, settings, results_dir=None, save_intermediate=True,
             if self.save_intermediate:
                 raise ValueError('Results dir must be specified in parameters or settings!')

-        self.asr_model = create_model_instance(hparams=self.model_hparams, device=devices[0])
-        self.is_phones = (self.asr_model.output == 'phones')
-
-        if self.n_processes > 1:
-            self.asr_model = None
+        self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]
+        self.is_phones = (self.asr_models[0].output == 'phones')

     def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None):
         dataset_name = dataset_name if dataset_name else dataset_path.name

From 98362ae1a8e9410a30617cee7a394b8d50937d40 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:15:18 +0100
Subject: [PATCH 23/33] Fix minor bug 'force_compute_all' not a valid argument

---
 run_anonymization.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/run_anonymization.py b/run_anonymization.py
index 6c3d0d3..14085d3 100644
--- a/run_anonymization.py
+++ b/run_anonymization.py
@@ -28,6 +28,6 @@
     else:
         devices.append(torch.device('cpu'))

-    pipeline = PIPELINES[config['pipeline']](config=config, force_compute_all=args.force_compute, devices=devices)
-    pipeline.run_anonymization_pipeline(datasets)
-
+    with torch.no_grad():
+        pipeline = PIPELINES[config['pipeline']](config=config, force_compute=args.force_compute, devices=devices)
+        pipeline.run_anonymization_pipeline(datasets)

From 27493faedfc3ecd895adec1a29cfc41c4e1fa913 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:18:27 +0100
Subject: [PATCH 24/33] Fix minor bug 'cycle' undefined

---
 anonymization/modules/text/speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index 1a14f45..0b1887e 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -2,7 +2,7 @@
 from tqdm.contrib.concurrent import process_map
 import time
 from torch.multiprocessing import set_start_method
-from itertools import repeat
+from itertools import cycle, repeat
 import numpy as np
 from pathlib import Path

From f0eec769d6cc4ef87254688a22abe0e6f4f77568 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:20:42 +0100
Subject: [PATCH 25/33] Fix spurious 'n_processes'

---
 .../modules/speaker_embeddings/speaker_extraction.py | 4 ++--
 anonymization/modules/text/speech_recognition.py     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 38faa49..8af9f7a 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -5,7 +5,7 @@
 from tqdm.contrib.concurrent import process_map
 import time
 from torch.multiprocessing import set_start_method
-from itertools import repeat
+from itertools import repeat, cycle
 import numpy as np

 from .extraction.embedding_methods import SpeechBrainVectors, StyleEmbeddings
@@ -44,7 +44,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode
             'model_path': self.embed_model_path,
         }

-        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]
+        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(len(devices)))]

     def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         dataset_name = dataset_name if dataset_name is not None else dataset_path.name
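With range(len(devices)), the zip now yields each device exactly once, so one model is built per device and the roundabout pairing is equivalent to a plain loop. A sketch of the simplification, reusing create_extractors and the hparams from the diff above:

    # zip(cycle(devices), range(len(devices))) pairs each device with an
    # index exactly once, so the comprehension reduces to:
    extractors = [create_extractors(hparams=model_hparams, device=device) for device in devices]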
diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py index 0b1887e..d186a9d 100644 --- a/anonymization/modules/text/speech_recognition.py +++ b/anonymization/modules/text/speech_recognition.py @@ -33,7 +33,7 @@ def __init__(self, devices, settings, results_dir=None, save_intermediate=True, if self.save_intermediate: raise ValueError('Results dir must be specified in parameters or settings!') - self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))] + self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(len(devices)))] self.is_phones = (self.asr_models[0].output == 'phones') def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None): From 28e6c580a700bacd2537455e821491dc7ac6505c Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 22:25:30 +0100 Subject: [PATCH 26/33] Fix speech_recognition --- anonymization/modules/text/speech_recognition.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py index d186a9d..3635426 100644 --- a/anonymization/modules/text/speech_recognition.py +++ b/anonymization/modules/text/speech_recognition.py @@ -71,7 +71,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) start = time.time() if self.n_processes == 1: - new_texts = [recognition_job([utterances, self.asr_model, + new_texts = [recognition_job([utterances, self.asr_models[0], dataset_results_dir, 0, self.devices[0], self.model_hparams, None, save_intermediate])] else: @@ -79,7 +79,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) indices = np.array_split(np.arange(len(utterances)), self.n_processes) utterance_jobs = [[utterances[ind] for ind in chunk] for chunk in indices] # multiprocessing - job_params = zip(utterance_jobs, repeat(self.asr_model), repeat(dataset_results_dir), sleeps, + job_params = zip(utterance_jobs, repeat(self.asr_models), repeat(dataset_results_dir), sleeps, self.devices, repeat(self.model_hparams), list(range(self.n_processes)), repeat(save_intermediate)) new_texts = process_map(recognition_job, job_params, max_workers=self.n_processes) @@ -127,9 +127,6 @@ def recognition_job(data): add_suffix = f'_{job_id}' if job_id is not None else None job_id = job_id or 0 - if asr_model is None: - asr_model = create_model_instance(hparams=model_hparams, device=device) - texts = Text(is_phones=(asr_model.output == 'phones')) i = 0 for utt, spk, wav_path in tqdm(utterances, desc=f'Job {job_id}', leave=True): From 33c3dbf96ec731775ac36aa8ecafd664eeb4045f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Wed, 20 Dec 2023 09:56:43 +0100 Subject: [PATCH 27/33] Fix speaker extraction --- anonymization/modules/speaker_embeddings/speaker_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 8af9f7a..9600f1f 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -78,7 +78,7 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): utts = [x[1] for x in 
returns] utts = list(np.concatenate(utts)) else: - vectors, utts = extraction_job([wav_scp, self.extractors, 0, self.devices[0], self.model_hparams, 0]) + vectors, utts = extraction_job([wav_scp, self.extractors[0], 0, self.devices[0], self.model_hparams, 0]) vectors = torch.stack(vectors, dim=0) speakers = [utt2spk[utt] for utt in utts] From 08c9f4a90864234be405d292b54eb675372ce023 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Wed, 20 Dec 2023 12:32:48 +0100 Subject: [PATCH 28/33] Changes to the environment.yaml and makefile --- Makefile | 8 ++------ environment.yaml | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2717fe6..2b43415 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ ############################### ## CONFIGURATION ############################### -PHONY: install uninstall pretrained_models espnet +PHONY: install uninstall pretrained_models .ONESHELL: PROJECT_NAME = voicepat @@ -17,14 +17,10 @@ endif ##@ INSTALLATION ############################### -install: $(ENV_NAME) espnet ## performs the installation. Currently the only step is to install the conda environment - @git clone https://github.com/egaznep/sctk - @conda build sctk - @conda install --use-local sctk +install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment uninstall: @rm -rf $(ENV_NAME) - @rm -rf ESPNet @rm -rf models/ pretrained_models: ## downloads the pretrained models from IMS repositories diff --git a/environment.yaml b/environment.yaml index 7d9a926..8818f7c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -7,7 +7,7 @@ channels: dependencies: - pip - pip: - - speechbrain + - git+https://github.com/speechbrain/HyperPyYAML.git # pip version has a bug - noisereduce - pyloudnorm - phonemizer From d0f6f8436c1ddaf685a15b07e571cd5df645086f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 21 Dec 2023 12:50:19 +0100 Subject: [PATCH 29/33] Fix pretrained model installation script --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2b43415..4eb9869 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,10 @@ pretrained_models: ## downloads the pretrained models from IMS repositories @unzip -oq models/asr.zip -d models @unzip -oq models/tts.zip -d models @unzip -oq models/anonymization.zip -d models - @unzip -oq models/pre_eval_models.zip -d models @mkdir evaluation/utility/asr/exp + @unzip -oq models/pre_eval_models.zip -d evaluation/utility/asr/exp @ln -srf evaluation/utility/asr/exp exp + @cp evaluation/privacy/asv/ @rm models/*.zip From 9df17c7a98727e14d59896fa0ea0a66816265915 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 21 Dec 2023 15:10:24 +0100 Subject: [PATCH 30/33] Fixes to the config files - Add missing !PLACEHOLDER tags - Define two dataset configs, one for anonymization, one for evaluation (vctk all/common/diff requires separate treatment) --- configs/anon_ims_sttts_pc.yaml | 3 +- ...ml => datasets_vpc2022_official_anon.yaml} | 0 configs/datasets_vpc2022_official_eval.yaml | 30 +++++++++++++++ configs/eval_gvd_both.yaml | 32 +--------------- configs/eval_post_ecapa_cos_ft.yaml | 34 +---------------- configs/eval_post_ecapa_cos_scratch.yaml | 32 +--------------- configs/eval_post_xvector_plda_scratch.yaml | 32 +--------------- configs/eval_pre_ecapa_cos.yaml | 10 ++--- configs/eval_pre_ecapa_plda.yaml | 38 ++----------------- configs/eval_pre_xvector_cos.yaml | 32 +--------------- 
configs/eval_pre_xvector_plda.yaml | 32 +--------------- 11 files changed, 48 insertions(+), 227 deletions(-) rename configs/{datasets_vpc2022_official.yaml => datasets_vpc2022_official_anon.yaml} (100%) create mode 100644 configs/datasets_vpc2022_official_eval.yaml diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index 87760af..f60f8ff 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -8,7 +8,8 @@ models_dir: !PLACEHOLDER # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false -datasets: !include:datasets_vpc2022_official.yaml +save_intermediate: true +datasets: !include:datasets_vpc2022_official_anon.yaml pipeline: sttts diff --git a/configs/datasets_vpc2022_official.yaml b/configs/datasets_vpc2022_official_anon.yaml similarity index 100% rename from configs/datasets_vpc2022_official.yaml rename to configs/datasets_vpc2022_official_anon.yaml diff --git a/configs/datasets_vpc2022_official_eval.yaml b/configs/datasets_vpc2022_official_eval.yaml new file mode 100644 index 0000000..cd69244 --- /dev/null +++ b/configs/datasets_vpc2022_official_eval.yaml @@ -0,0 +1,30 @@ + - name: libri_dev + data: libri + set: dev + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: libri_test + data: libri + set: test + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_dev + data: vctk + set: dev + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_dev_common + data: vctk + set: dev + enrolls: [enrolls] + trials: [trials_f_common, trials_m_common] + - name: vctk_test + data: vctk + set: test + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_test_common + data: vctk + set: test + enrolls: [enrolls] + trials: [trials_f_common, trials_m_common] \ No newline at end of file diff --git a/configs/eval_gvd_both.yaml b/configs/eval_gvd_both.yaml index c0b8b91..a8e1f00 100644 --- a/configs/eval_gvd_both.yaml +++ b/configs/eval_gvd_both.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip utility: diff --git a/configs/eval_post_ecapa_cos_ft.yaml b/configs/eval_post_ecapa_cos_ft.yaml index 610d80a..d6dc406 100644 --- a/configs/eval_post_ecapa_cos_ft.yaml +++ b/configs/eval_post_ecapa_cos_ft.yaml @@ -1,37 +1,7 @@ root_dir: . 
exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: @@ -41,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: diff --git a/configs/eval_post_ecapa_cos_scratch.yaml b/configs/eval_post_ecapa_cos_scratch.yaml index 05aa510..a3a60a7 100644 --- a/configs/eval_post_ecapa_cos_scratch.yaml +++ b/configs/eval_post_ecapa_cos_scratch.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: diff --git a/configs/eval_post_xvector_plda_scratch.yaml b/configs/eval_post_xvector_plda_scratch.yaml index 9326102..61b89e6 100644 --- a/configs/eval_post_xvector_plda_scratch.yaml +++ b/configs/eval_post_xvector_plda_scratch.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
Remove entry to skip privacy: diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index f81a568..672224e 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -1,17 +1,17 @@ root_dir: . exp_dir: !ref /exp -datasets: !include:datasets_vpc2022_official.yaml +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: - asv utility: - asr - - gvd + # - gvd -anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +anon_data_suffix: res # suffix for dataset to signal that it is anonymized +eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -42,7 +42,7 @@ privacy: utility: asr: - libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: ~/Projects/2022-voiceprivacychallenge/baseline/corpora/ # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_plda.yaml b/configs/eval_pre_ecapa_plda.yaml index f27d44a..90363dd 100644 --- a/configs/eval_pre_ecapa_plda.yaml +++ b/configs/eval_pre_ecapa_plda.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: @@ -41,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
privacy: asv: @@ -51,7 +21,7 @@ privacy: training: anon: false # true or false, depending on whether the training data for the ASV is anonymized or original train_data_dir: !ref /LibriSpeech/train-clean-360 # path to original or anonymized training data for ASV - train_config: evaluation/privacy/asv_train/hparams/train_ecapa_tdnn_small.yaml + train_config: evaluation/privacy/asv/asv_train/hparams/train_ecapa_tdnn_small.yaml finetuning: false # true (ft) or false (scratch) pretrained_model: null # path to pretrained model, only used for finetuning lr: 0.01 @@ -72,7 +42,7 @@ privacy: utility: asr: - libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: ~/Projects/2022-voiceprivacychallenge/baseline/corpora/ # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_cos.yaml b/configs/eval_pre_xvector_cos.yaml index 1c9fc4f..a835ff6 100644 --- a/configs/eval_pre_xvector_cos.yaml +++ b/configs/eval_pre_xvector_cos.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: diff --git a/configs/eval_pre_xvector_plda.yaml b/configs/eval_pre_xvector_plda.yaml index c7e3d04..8ec1b9f 100644 --- a/configs/eval_pre_xvector_plda.yaml +++ b/configs/eval_pre_xvector_plda.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
Remove entry to skip privacy: From 8b098729bbc2e4aa5a766d8f949bf1b12e422c2c Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 10:13:10 +0100 Subject: [PATCH 31/33] Switch to `logging` for print statements --- .../extraction/ims_prosody_extraction.py | 4 +- .../modules/prosody/prosody_extraction.py | 10 +++-- .../anonymization/gan_anon.py | 8 ++-- .../anonymization/pool_anon.py | 7 ++- .../anonymization/random_anon.py | 4 +- .../anonymization/utils/plda_model.py | 6 ++- .../speaker_anonymization.py | 8 ++-- .../speaker_embeddings/speaker_extraction.py | 9 ++-- .../modules/text/speech_recognition.py | 11 ++--- .../InferenceInterfaces/AnonFastSpeech2.py | 4 +- .../modules/tts/IMSToucan/Utility/utils.py | 7 +-- .../modules/tts/IMSToucan/UtteranceCloner.py | 4 +- anonymization/modules/tts/ims_tts.py | 4 +- anonymization/modules/tts/speech_synthesis.py | 11 ++--- anonymization/pipelines/sttts_pipeline.py | 7 ++- evaluation/privacy/asv/asv.py | 6 ++- .../privacy/asv/asv_train/libri_prepare.py | 20 ++++----- evaluation/privacy/asv/metrics/cllr.py | 8 ++-- evaluation/privacy/asv/metrics/linkability.py | 5 ++- .../privacy/asv/metrics/utils/zebra_plots.py | 11 ++--- .../asr/pyscripts/utils/plot_sinc_filters.py | 20 +++++---- .../utility/voice_distinctiveness/deid_gvd.py | 4 +- run_anonymization.py | 3 ++ run_evaluation.py | 43 ++++++++++--------- utils/data_io.py | 4 +- 25 files changed, 136 insertions(+), 92 deletions(-) diff --git a/anonymization/modules/prosody/extraction/ims_prosody_extraction.py b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py index cc700af..7aad79e 100644 --- a/anonymization/modules/prosody/extraction/ims_prosody_extraction.py +++ b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py @@ -1,3 +1,4 @@ +import logging import torch torch.set_num_threads(1) @@ -11,6 +12,7 @@ from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth +logger = logging.getLogger(__name__) class ImsProsodyExtractor: @@ -54,7 +56,7 @@ def extract_prosody(self, try: norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave) except ValueError: - print('Something went wrong, the reference wave might be too short.') + logger.error('Something went wrong, the reference wave might be too short.') raise RuntimeError with torch.inference_mode(): diff --git a/anonymization/modules/prosody/prosody_extraction.py b/anonymization/modules/prosody/prosody_extraction.py index 478f574..dcc6c2d 100644 --- a/anonymization/modules/prosody/prosody_extraction.py +++ b/anonymization/modules/prosody/prosody_extraction.py @@ -1,3 +1,4 @@ +import logging import torch torch.set_num_threads(1) @@ -8,6 +9,7 @@ from .extraction import * from utils import read_kaldi_format +logger = logging.getLogger(__name__) class ProsodyExtraction: @@ -47,7 +49,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None): wav_scp = {utt: wav_scp[utt] for utt in unprocessed_utts} if wav_scp: - print(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances') + logger.info(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances') data_prosody.new = True i = 0 for utt, wav_path in tqdm(wav_scp.items()): @@ -56,7 +58,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None): utt_prosody = 
self.extractor.extract_prosody(transcript=text, ref_audio_path=wav_path, input_is_phones=text_is_phones)
             except IndexError:
-                print(f'Index Error for {utt}')
+                logger.warning(f'IndexError for {utt}')
                 continue
             duration, pitch, energy, start_silence, end_silence = utt_prosody
             data_prosody.add_instance(utterance=utt, duration=duration, pitch=pitch, energy=energy,
@@ -69,8 +71,8 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
                 data_prosody.save_prosody(dataset_results_dir)

         elif len(data_prosody.utterances) > 0:
-            print('No prosody extraction necessary; load stored values instead...')
+            logger.info('No prosody extraction necessary; load stored values instead...')
         else:
-            print(f'No utterances could be found in {dataset_path}!')
+            logger.warning(f'No utterances could be found in {dataset_path}!')

         return data_prosody

diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
index 0fb5757..5d1ab84 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 import torch
 import numpy as np
@@ -10,6 +11,7 @@
 from ..speaker_embeddings import SpeakerEmbeddings
 from .utils.WGAN import EmbeddingsGenerator

+logger = logging.getLogger(__name__)

 class GANAnonymizer(BaseAnonymizer):
     """
@@ -84,9 +86,9 @@ def anonymize_embeddings(
                 or 'utt' for utterance level).
         """
         if emb_level == "spk":
-            print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
+            logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
         elif emb_level == "utt":
-            print(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")
+            logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")

         identifiers = []
         speakers = []
@@ -117,7 +119,7 @@ def anonymize_embeddings(
         return anon_embeddings

     def _generate_artificial_embeddings(self, gan_model_path: Path, n: int):
-        print(f"Generate {n} artificial speaker embeddings...")
+        logger.info(f"Generate {n} artificial speaker embeddings...")
         generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device)
         gan_vectors = generator.generate_embeddings(n=n)
         unused_indices = np.arange(len(gan_vectors))

diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
index 3a21db1..c24151f 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 import numpy as np
 import torch
@@ -15,6 +16,8 @@
 from ..speaker_embeddings import SpeakerEmbeddings
 from utils import transform_path

+logger = logging.getLogger(__name__)
+
 REVERSED_GENDERS = {
     "m": "f",
     "f": "m"
 }
@@ -144,7 +147,7 @@ def __init__(
         self.stats_per_dim_path = stats_per_dim_path or Path()

     def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_path):
-        print(pool_data_dir)
+        logger.debug(pool_data_dir)
         if pool_vec_path.exists():
             pool_embeddings = SpeakerEmbeddings(
                 vec_type=self.vec_type, emb_level="spk", device=self.device
             )
@@ -168,7 +171,7 @@ def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str
             vectors_a=self.pool_embeddings.vectors, vectors_b=speaker_embeddings.vectors
         )

-        print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
+        logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
         identifiers = []
         speakers = []
         anon_vectors = []
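All of these files adopt the same per-module logger pattern. As a reminder of how it behaves, a minimal sketch; the basicConfig call is illustrative and not part of this patch, some handler setup along these lines is assumed at the entry points:

    import logging

    logger = logging.getLogger(__name__)  # hierarchical name, e.g. 'anonymization.modules...'

    def work():
        logger.info('started')  # dropped silently until a handler is configured

    if __name__ == '__main__':
        # one-time setup at the application entry point; module loggers
        # inherit the root handler, and level filters apply hierarchically
        logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s: %(message)s')
        work()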
diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
index 98ffc84..f0d2463 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from pathlib import Path
 import torch
 from os import PathLike
@@ -8,6 +9,7 @@
 from .base_anon import BaseAnonymizer
 from ..speaker_embeddings import SpeakerEmbeddings

+logger = logging.getLogger(__name__)

 class RandomAnonymizer(BaseAnonymizer):
     """
@@ -73,7 +75,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level="spk"):
                 utterance level).
         """
         if self.scaling_ranges:
-            print("Anonymize vectors in scale!")
+            logger.debug("Anonymize vectors in scale!")
             return self._anonymize_data_in_scale(speaker_embeddings)
         else:
             identifiers = []

diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
index 3abf1fa..18482de 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
@@ -1,9 +1,11 @@
 # This code is based on the descriptions in https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/PLDA_LDA.py
+import logging
 from pathlib import Path
 from speechbrain.processing.PLDA_LDA import PLDA, StatObject_SB, Ndx, fast_PLDA_scoring
 import numpy as np
 import torch

+logger = logging.getLogger(__name__)

 class PLDAModel:
     def __init__(self, train_embeddings, results_path: Path=None, save_plda=True):
@@ -64,13 +66,13 @@ def _train_plda(self, train_embeddings):
         vectors = train_embeddings.vectors.to(torch.float64)
         modelset = np.array([f'md{speaker}' for speaker in train_embeddings.original_speakers], dtype="|O")

-        print(len(modelset), len(set(modelset)))
+        logger.debug('%s %s', len(modelset), len(set(modelset)))
         segset, s, stat0 = self._get_vector_stats(vectors, sg_tag='sg', utt_ids=train_embeddings.get_utt_list())

         xvectors_stat = StatObject_SB(modelset=modelset, segset=segset, start=s, stop=s, stat0=stat0,
                                       stat1=vectors.cpu().numpy())

-        print(vectors.shape)
+        logger.debug(vectors.shape)
         plda = PLDA(rank_f=100)
         plda.plda(xvectors_stat)
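One subtlety when converting print calls: print accepts several positional values, while logging treats extra positional arguments as %-style substitutions for the message string, hence the '%s %s' form above. A short sketch with dummy values:

    import logging

    logger = logging.getLogger(__name__)
    n_models, n_unique = 921, 921  # dummy values

    logger.debug('%s %s', n_models, n_unique)  # lazy: formatted only if the record is emitted
    logger.debug(f'{n_models} {n_unique}')     # eager f-string, also valid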
diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py
index 5c82113..f156eaf 100644
--- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py
+++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py
@@ -1,8 +1,10 @@
+import logging
 from pathlib import Path

 from .anonymization.base_anon import BaseAnonymizer
 from .speaker_embeddings import SpeakerEmbeddings

+logger = logging.getLogger(__name__)

 class SpeakerAnonymization:

@@ -38,14 +40,14 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name):
                 self.force_compute:
             # if there are already anonymized speaker embeddings from this model and the computation is not forced,
             # simply load them
-            print('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
+            logger.info('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
                   'instead...')
             anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=self.emb_level, device=self.device)
             anon_embeddings.load_vectors(dataset_results_dir)
             return anon_embeddings
         else:
             # otherwise, create new anonymized speaker embeddings
-            print('Anonymize speaker embeddings...')
+            logger.info('Anonymize speaker embeddings...')
             anon_embeddings = self.anonymizer.anonymize_embeddings(speaker_embeddings,
                                                                    emb_level=self.emb_level)
             if self.save_intermediate:
@@ -58,5 +60,5 @@ def _load_anonymizer(self, settings: dict):
             'The anonymizer must be an instance of BaseAnonymizer, or a ' \
             f'subclass of it, but received an instance of {type(anon_method)}'

-        print(f'Model type of anonymizer: {type(anon_method).__name__}')
+        logger.info(f'Model type of anonymizer: {type(anon_method).__name__}')
         return anon_method

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 9600f1f..507781f 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -1,3 +1,4 @@
+import logging
 from tqdm import tqdm
 from pathlib import Path
 import torch
@@ -14,7 +15,7 @@
 from utils import read_kaldi_format

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeakerExtraction:

@@ -57,12 +58,12 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='utt', device=self.devices[0])

         if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute:
-            print('No speaker extraction necessary; load existing embeddings instead...')
+            logger.info('No speaker extraction necessary; load existing embeddings instead...')
             speaker_embeddings.load_vectors(dataset_results_dir)
             # assume the loaded vectors are computed according to the setting in config
             speaker_embeddings.emb_level = emb_level
         else:
-            print(f'Extract embeddings of {len(wav_scp)} utterances')
+            logger.info(f'Extract embeddings of {len(wav_scp)} utterances')
             speaker_embeddings.new = True

             if self.n_processes > 1:
@@ -126,7 +127,7 @@ def extraction_job(data):
         try:
             spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors]
         except RuntimeError as e:
-            print(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
+            logger.warning(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
             continue

         if len(spk_embs) == 1:

diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index 3635426..8992b8c 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -1,6 +1,7 @@
 from tqdm import tqdm
 from tqdm.contrib.concurrent import process_map
 import time
+import logging
 from torch.multiprocessing import set_start_method
 from itertools import cycle, repeat
 import numpy as np
@@ -11,7 +12,7 @@
 from utils import read_kaldi_format

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeechRecognition:

@@ -49,13 +50,13 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None)
             texts.load_text(in_dir=dataset_results_dir)

             if len(texts) == len(utt2spk):
-                print('No speech recognition necessary; load existing text instead...')
+                logger.info('No speech recognition necessary; load existing text instead...')
             else:
                 if len(texts) > 0:
-                    print(f'No speech recognition necessary for {len(texts)}
of {len(utt2spk)} utterances') + logger.info(f'No speech recognition necessary for {len(texts)} of {len(utt2spk)} utterances') # otherwise, recognize the speech dataset_results_dir.mkdir(exist_ok=True, parents=True) - print(f'Recognize speech of {len(utt2spk)} utterances...') + logger.info(f'Recognize speech of {len(utt2spk)} utterances...') wav_scp = read_kaldi_format(dataset_path / 'wav.scp') utterances = [] @@ -86,7 +87,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) end = time.time() total_time = round(end - start, 2) - print(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / ' + logger.info(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / ' f'{round(total_time / 60 / 60, 2)} hours)') texts = self._combine_texts(main_text_instance=texts, additional_text_instances=new_texts) diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py index 8ecd359..a2ba7c3 100644 --- a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py +++ b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py @@ -1,5 +1,6 @@ import itertools import os +import logging import librosa.display as lbd import matplotlib.pyplot as plt @@ -15,6 +16,7 @@ from ..Preprocessing.TextFrontend import get_language_id from ..TrainingInterfaces.Spectrogram_to_Embedding.StyleEmbedding import StyleEmbedding +logger = logging.getLogger(__name__) class AnonFastSpeech2(torch.nn.Module): @@ -174,7 +176,7 @@ def read_to_file(self, for (text, durations, pitch, energy) in itertools.zip_longest(text_list, dur_list, pitch_list, energy_list): if text.strip() != "": if not silent: - print("Now synthesizing: {}".format(text)) + logger.info("Now synthesizing: {}".format(text)) if wav is None: if durations is not None: durations = durations.to(self.device) diff --git a/anonymization/modules/tts/IMSToucan/Utility/utils.py b/anonymization/modules/tts/IMSToucan/Utility/utils.py index 5fa60eb..9809d58 100644 --- a/anonymization/modules/tts/IMSToucan/Utility/utils.py +++ b/anonymization/modules/tts/IMSToucan/Utility/utils.py @@ -4,9 +4,10 @@ import os from abc import ABC - +import logging import torch +logger = logging.getLogger(__name__) def cumsum_durations(durations): out = [0] @@ -39,11 +40,11 @@ def get_most_recent_checkpoint(checkpoint_dir, verbose=True): if el.endswith(".pt") and el != "best.pt": checkpoint_list.append(int(el.split(".")[0].split("_")[1])) if len(checkpoint_list) == 0: - print("No previous checkpoints found, cannot reload.") + logger.info("No previous checkpoints found, cannot reload.") return None checkpoint_list.sort(reverse=True) if verbose: - print("Reloading checkpoint_{}.pt".format(checkpoint_list[0])) + logger.info("Reloading checkpoint_{}.pt".format(checkpoint_list[0])) return os.path.join(checkpoint_dir, "checkpoint_{}.pt".format(checkpoint_list[0])) diff --git a/anonymization/modules/tts/IMSToucan/UtteranceCloner.py b/anonymization/modules/tts/IMSToucan/UtteranceCloner.py index 6d05e8e..df02d78 100644 --- a/anonymization/modules/tts/IMSToucan/UtteranceCloner.py +++ b/anonymization/modules/tts/IMSToucan/UtteranceCloner.py @@ -1,3 +1,4 @@ +import logging import soundfile as sf import torch from torch.optim import SGD @@ -10,6 +11,7 @@ from .TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator from 
.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth

+logger = logging.getLogger(__name__)

 class UtteranceCloner:

@@ -59,7 +61,7 @@ def extract_prosody(self,
         try:
             norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
         except ValueError:
-            print('Something went wrong, the reference wave might be too short.')
+            logger.error('Something went wrong, the reference wave might be too short.')
             raise RuntimeError

         with torch.inference_mode():

diff --git a/anonymization/modules/tts/ims_tts.py b/anonymization/modules/tts/ims_tts.py
index a5a4cc6..edcabb1 100644
--- a/anonymization/modules/tts/ims_tts.py
+++ b/anonymization/modules/tts/ims_tts.py
@@ -1,8 +1,10 @@
 import torch
 import resampy
+import logging

 from .IMSToucan.InferenceInterfaces.AnonFastSpeech2 import AnonFastSpeech2

+logger = logging.getLogger(__name__)

 class ImsTTS:

@@ -36,7 +38,7 @@ def read_text(self, text, speaker_embedding, text_is_phones=True, duration=None,
             if i > 30:
                 break
         if i > 0:
-            print(f'Synthesized utt in {i} takes')
+            logger.info(f'Synthesized utt in {i} takes')

         # start and end silence are computed for 16000, so we have to adapt this to different output sr
         factor = self.output_sr // 16000

diff --git a/anonymization/modules/tts/speech_synthesis.py b/anonymization/modules/tts/speech_synthesis.py
index 911e68c..17d996a 100644
--- a/anonymization/modules/tts/speech_synthesis.py
+++ b/anonymization/modules/tts/speech_synthesis.py
@@ -1,6 +1,7 @@
 from tqdm import tqdm
 import soundfile
 import time
+import logging
 from torch.multiprocessing import Pool, set_start_method
 from itertools import repeat

@@ -8,7 +9,7 @@
 from utils import create_clean_dir

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeechSynthesis:

@@ -52,7 +53,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                                          if wav_file.stem in texts.utterances}

             if len(already_synthesized_utts):
-                print(f'No synthesis necessary for {len(already_synthesized_utts)} of {len(texts)} utterances...')
+                logger.info(f'No synthesis necessary for {len(already_synthesized_utts)} of {len(texts)} utterances...')
                 texts.remove_instances(list(already_synthesized_utts.keys()))
                 if self.save_output:
                     wavs = already_synthesized_utts
@@ -63,7 +64,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                         wavs[utt] = wav

         if texts:
-            print(f'Synthesize {len(texts)} utterances...')
+            logger.info(f'Synthesize {len(texts)} utterances...')
             if self.force_compute or not dataset_results_dir.exists():
                 create_clean_dir(dataset_results_dir)

@@ -84,7 +85,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                         utt_prosody_dict = {}
                     instances.append((text, utt, speaker_embedding, utt_prosody_dict))
                 except KeyError:
-                    print(f'Key error at {utt}')
+                    logger.warning(f'Key error at {utt}')
                     continue

             wavs.update(synthesis_job(instances=instances, tts_model=self.tts_models[0],
                                       out_dir=dataset_results_dir, sleep=0, text_is_phones=text_is_phones,
@@ -111,7 +112,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                             utt_prosody_dict = {}
                         job_instances.append((text, utt, speaker_embedding, utt_prosody_dict))
                     except KeyError:
-                        print(f'Key error at {utt}')
+                        logger.warning(f'Key error at {utt}')
                         continue
                 instances.append(job_instances)

diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py
index 10f5a1b..9226ace 100644
--- a/anonymization/pipelines/sttts_pipeline.py
+++
b/anonymization/pipelines/sttts_pipeline.py @@ -1,5 +1,6 @@ from pathlib import Path from datetime import datetime +import logging from anonymization.modules import ( SpeechRecognition, @@ -12,6 +13,7 @@ import typing from utils import prepare_evaluation_data, save_yaml +logger = logging.getLogger(__name__) class STTTSPipeline: def __init__(self, config: dict, force_compute: bool, devices: list): @@ -110,7 +112,7 @@ def run_anonymization_pipeline( anon_wav_scps = {} for i, (dataset_name, dataset_path) in enumerate(datasets.items()): - print(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...") + logger.info(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...") # Step 1: Recognize speech, extract speaker embeddings, extract prosody texts = self.speech_recognition.recognize_speech( dataset_path=dataset_path, dataset_name=dataset_name @@ -147,9 +149,10 @@ def run_anonymization_pipeline( emb_level=anon_embeddings.emb_level, ) anon_wav_scps[dataset_name] = wav_scp - print("Done") + logger.info("Anonymization pipeline completed.") if prepare_results: + logger.info("Preparing results according to the Kaldi format.") if self.speaker_anonymization: anon_vectors_path = self.speaker_anonymization.results_dir else: diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py index 6792d44..66d8080 100644 --- a/evaluation/privacy/asv/asv.py +++ b/evaluation/privacy/asv/asv.py @@ -1,5 +1,6 @@ # This code is partly based on # https://github.com/speechbrain/speechbrain/blob/develop/recipes/VoxCeleb/SpeakerRec/speaker_verification_plda.py +import logging from pathlib import Path import torch from speechbrain.utils.metric_stats import EER @@ -10,6 +11,7 @@ from anonymization.modules.speaker_embeddings import SpeakerExtraction from utils import write_table, read_kaldi_format, save_kaldi_format +logger = logging.getLogger(__name__) class ASV: @@ -138,7 +140,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids): if self.plda_model_dir.exists(): self.plda = PLDAModel(train_embeddings=None, results_path=self.plda_model_dir) else: - print('Train PLDA model...') + logger.info('Train PLDA model...') plda_data_dir = self.plda_train_data_dir if self.plda_anon: @@ -146,7 +148,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids): self.select_data_for_plda(all_data_dir=self.plda_train_data_dir, selected_data_dir=self.model_dir.parent, out_dir=plda_data_dir) - print(f'Using data under {plda_data_dir}') + logger.info(f'Using data under {plda_data_dir}') train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, emb_level='utt') self.plda = PLDAModel(train_embeddings=train_dict, results_path=self.plda_model_dir) diff --git a/evaluation/privacy/asv/asv_train/libri_prepare.py b/evaluation/privacy/asv/asv_train/libri_prepare.py index 65be867..ab1630e 100644 --- a/evaluation/privacy/asv/asv_train/libri_prepare.py +++ b/evaluation/privacy/asv/asv_train/libri_prepare.py @@ -180,7 +180,7 @@ def _get_utt_split_lists( train_lst = [] dev_lst = [] - print("Getting file list...") + logger.debug("Getting file list...") for data_folder in data_folders: if anon: suffix = 'wav' @@ -212,14 +212,14 @@ def _get_utt_split_lists( selected_spk = {} #select the number of speakers if num_spk != 'ALL': - print("selected %s speakers for training"%num_spk) + logger.debug("selected %s speakers for training"%num_spk) selected_spks_pure = random.sample(spks_pure,int(num_spk)) for k,v in spk_files.items(): if k.split('-')[0] in selected_spks_pure: 
selected_spk[k] = v #selected_spk = dict(random.sample(spk_files.items(), int(num_spk))) elif num_spk == 'ALL': - print("selected all speakers for training") + logger.debug("selected all speakers for training") selected_spk = spk_files else: sys.exit("invalid $utt_spk value") @@ -228,7 +228,7 @@ def _get_utt_split_lists( if num_utt != 'ALL': # select the number of utterances for each speaker-sess-id if utt_selected_ways == 'spk-sess': - print("selected %s utterances for each selected speaker-sess-id" % num_utt) + logger.info("selected %s utterances for each selected speaker-sess-id" % num_utt) for spk in selected_spk: if len(selected_spk[spk]) >= int(num_utt): selected_list.extend(random.sample(selected_spk[spk], int(num_utt))) @@ -236,7 +236,7 @@ def _get_utt_split_lists( selected_list.extend(selected_spk[spk]) elif utt_selected_ways == 'spk-random': - print("randomly selected %s utterances for each selected speaker-id" % num_utt) + logger.info("randomly selected %s utterances for each selected speaker-id" % num_utt) selected_spks_pure = {} for k, v in selected_spk.items(): spk_pure = k.split('-')[0] @@ -253,7 +253,7 @@ def _get_utt_split_lists( selected_list.extend(selected_spk[spk]) elif utt_selected_ways == 'spk-diverse-sess': - print("diversely selected %s utterances for each selected speaker-id" % num_utt) + logger.info("diversely selected %s utterances for each selected speaker-id" % num_utt) selected_spks_pure = {} for k, v in selected_spk.items(): spk_pure = k.split('-')[0] @@ -273,7 +273,7 @@ def _get_utt_split_lists( elif num_utt == 'ALL': - print("selected all utterances for each selected speaker") + logger.info("selected all utterances for each selected speaker") for value in selected_spk.values(): for v in value: @@ -297,8 +297,8 @@ def _get_utt_split_lists( full = f'Full training set:{full_utt}' used = f'Used for training:{len(selected_list)}' - print(full) - print(used) + logger.debug(full) + logger.debug(used) split = int(0.01 * split_ratio[0] * len(selected_list)) train_snts = selected_list[:split] @@ -417,7 +417,7 @@ def prepare_csv(seg_dur, wav_lst, csv_file, random_segment=False, amp_th=0): ] entry.append(csv_line) - print(f'Skipped {len(problematic_wavs)} invalid audios') + logger.info(f'Skipped {len(problematic_wavs)} invalid audios') csv_output = csv_output + entry # Writing the csv lines diff --git a/evaluation/privacy/asv/metrics/cllr.py b/evaluation/privacy/asv/metrics/cllr.py index c11ba2d..40ba800 100644 --- a/evaluation/privacy/asv/metrics/cllr.py +++ b/evaluation/privacy/asv/metrics/cllr.py @@ -1,9 +1,11 @@ +import logging import numpy as np from scipy.special import expit from .helpers import optimal_llr from .utils.io import read_targets_and_nontargets +logger = logging.getLogger(__name__) def compute_cllr(score_file, key_file, compute_eer=False): # Computing Cllr and min Cllr for binary decision classifiers @@ -15,11 +17,9 @@ def compute_cllr(score_file, key_file, compute_eer=False): else: cllr_min = min_cllr(tar, non) - print("Cllr (min/act): %.3f/%.3f" % (cllr_min, cllr_act)) + logger.info("Cllr (min/act): %.3f/%.3f" % (cllr_min, cllr_act)) if compute_eer: - print("ROCCH-EER: %2.3f%%" % (100*eer)) - - print("") + logger.info("ROCCH-EER: %2.3f%%" % (100*eer)) def cllr(tar_llrs, nontar_llrs): diff --git a/evaluation/privacy/asv/metrics/linkability.py b/evaluation/privacy/asv/metrics/linkability.py index 0d0e057..91529e2 100644 --- a/evaluation/privacy/asv/metrics/linkability.py +++ b/evaluation/privacy/asv/metrics/linkability.py @@ -1,8 +1,10 @@ +import 
logging import numpy as np from .utils.visualization import draw_linkability_scores from .utils.io import read_targets_and_nontargets +logger = logging.getLogger(__name__) def compute_linkability(score_file, key_file, omega=1.0, use_draw_scores=False, output_file=None): # Computing the global linkability measure for a list of linkage function score @@ -20,8 +22,7 @@ def compute_linkability(score_file, key_file, omega=1.0, use_draw_scores=False, output_file = "linkability_" + score_file draw_linkability_scores(mated_scores, non_mated_scores, Dsys, D, bin_centers, bin_edges, str(output_file)) - print("linkability: %f" % (Dsys)) - print("") + logger.info("linkability: %f" % (Dsys)) def linkability(mated_scores, non_mated_scores, omega=1): diff --git a/evaluation/privacy/asv/metrics/utils/zebra_plots.py b/evaluation/privacy/asv/metrics/utils/zebra_plots.py index 81757d5..a59095a 100644 --- a/evaluation/privacy/asv/metrics/utils/zebra_plots.py +++ b/evaluation/privacy/asv/metrics/utils/zebra_plots.py @@ -1,3 +1,4 @@ +import logging import numpy as np from matplotlib._cm import datad import matplotlib.pyplot as mpl @@ -7,6 +8,7 @@ from .plo_plots import PriorLogOddsPlots from .io import read_targets_and_nontargets +logger = logging.getLogger(__name__) __author__ = "Andreas Nautsch" __email__ = "nautsch@eurecom.fr" @@ -84,10 +86,9 @@ def zebra_framework(plo_plot, scr_path, key_path, label='ZEBRA profile', str_max_abs_llr = '0' # print outs - print('') - print("%s" % label) - print("Population: %s bit" % str_dece) - print("Individual: %s (%s)" % (str_max_abs_llr, cat_tag)) + logger.info("%s" % label) + logger.info("Population: %s bit" % str_dece) + logger.info("Individual: %s (%s)" % (str_max_abs_llr, cat_tag)) # Creating log-odds plots if color_min is not None: @@ -99,7 +100,7 @@ def zebra_framework(plo_plot, scr_path, key_path, label='ZEBRA profile', # DCF if dcf_pot: plo_plot.plot_dcf(color_min=color_min, style_min=style_min, color_act=color_act, style_act=style_act) - print("1 - min Cllr: %.3f (0 is good)" % plo_plot.get_delta_DCF()) + logger.info("1 - min Cllr: %.3f (0 is good)" % plo_plot.get_delta_DCF()) plo_plot.add_legend_entry(legend_entry) diff --git a/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py b/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py index 6ca071f..8fc34b1 100755 --- a/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py +++ b/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py @@ -12,6 +12,7 @@ """ import argparse +import logging import sys from pathlib import Path @@ -19,6 +20,7 @@ import numpy as np import torch +logger = logging.getLogger(__name__) def get_parser(): """Construct the parser.""" @@ -141,7 +143,7 @@ def plot_filtergraph( ax.fill_between(x, f_mins, f_maxs, color="green", alpha=0.3) ax.legend(loc="upper left", prop={"size": 15}) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): @@ -154,7 +156,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): """ from espnet2.layers.sinc_conv import SincConv - print( + logger.warning( "When plotting filter kernels, make sure the script has the" " correct SincConv settings (currently hard-coded)."
) @@ -202,7 +204,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_pre_kernel_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) kernel = kernels[i][0] plt.clf() @@ -212,7 +214,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_kernel_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) plt.clf() plt.xlabel("kernel index") @@ -221,7 +223,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_kernel_both_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) y = np.zeros_like(x_f) y[F_mins[i] : F_maxs[i]] = 1.0 @@ -230,7 +232,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_freq_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) pre_y = np.zeros_like(x_f) pre_y[pre_F_mins[i] : pre_F_maxs[i]] = 1.0 @@ -240,7 +242,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_freq_both_%s.%s" % (str(i).zfill(2), args.filetype) img_path = args.out_folder / img_name plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) plt.clf() filters = [32, 71, 113, 126] @@ -259,7 +261,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") plt.close(fig) - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def plot_filters(indices, filename, F_mins, F_maxs, output_folder): @@ -282,7 +284,7 @@ def plot_filters(indices, filename, F_mins, F_maxs, output_folder): plt.plot(x, y) img_path = str(output_folder / filename) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def main(argv): diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index fb2c9df..6733756 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path import numpy as np import pandas as pd @@ -11,6 +12,7 @@ from evaluation.privacy import ASV from evaluation.privacy.asv.metrics.helpers import optimal_llr +logger = logging.getLogger(__name__) class VoiceDistinctiveness: @@ -98,7 +100,7 @@ def _select_utterances(self, spk2utt_x, spk2utt_y): y = [(spk, utt) for spk, utt_list in spk2utt_y.items() for utt in utt_list] else: - print("choose %d utterances for each spk to create trial" % int(self.num_per_spk)) + logger.info("choose %d utterances for each spk to create trial" % int(self.num_per_spk)) x = [(spk, utt) for spk, utt_list in spk2utt_x.items() for utt in random.sample(utt_list, k=min(self.num_per_spk, len(utt_list)))] y = [(spk, utt) for spk, utt_list in spk2utt_y.items() diff --git a/run_anonymization.py 
b/run_anonymization.py index 14085d3..49cf865 100644 --- a/run_anonymization.py +++ b/run_anonymization.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path from argparse import ArgumentParser import torch @@ -29,5 +30,7 @@ devices.append(torch.device('cpu')) with torch.no_grad(): + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logging.info(f'Running pipeline: {config["pipeline"]}') pipeline = PIPELINES[config['pipeline']](config=config, force_compute=args.force_compute, devices=devices) pipeline.run_anonymization_pipeline(datasets) diff --git a/run_evaluation.py b/run_evaluation.py index d81ab07..13b4976 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -1,4 +1,5 @@ # We need to set CUDA_VISIBLE_DEVICES before we import Pytorch so we will read all arguments directly on startup +import logging import os from argparse import ArgumentParser from pathlib import Path @@ -74,7 +75,7 @@ def find_asv_model_checkpoint(model_dir): def asv_train(train_params, output_dir): - print(f'Train ASV model: {output_dir}') + logging.info(f'Train ASV model: {output_dir}') hparams = { 'pretrained_path': str(train_params['pretrained_model']), 'batch_size': train_params['batch_size'], @@ -103,7 +104,7 @@ def asv_train(train_params, output_dir): def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, model_dir=None): model_dir = model_dir or find_asv_model_checkpoint(params['model_dir']) - print(f'Use ASV model for evaluation: {model_dir}') + logging.info(f'Use ASV model for evaluation: {model_dir}') save_dir = params['evaluation']['results_dir'] / f'{params["evaluation"]["distance"]}_out' asv = ASV(model_dir=model_dir, device=device, score_save_dir=save_dir, distance=params['evaluation']['distance'], @@ -121,7 +122,7 @@ def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, mod EER = asv.eer_compute(enrol_dir=eval_data_dir / enroll_name, test_dir=eval_data_dir / test_name, trial_runs_file=eval_data_dir / trial / 'trials') - print(f'{enroll_name}-{test_name}: {scenario.upper()}-EER={EER}') + logging.info(f'{enroll_name}-{test_name}: {scenario.upper()}-EER={EER}') trials_info = trial.split('_') gender = trials_info[3] if 'common' in trial: @@ -131,7 +132,7 @@ def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, mod 'trial': 'original' if scenario[1] == 'o' else 'anon', 'EER': round(EER * 100, 3)}) results_df = pd.DataFrame(results) - print(results_df) + logging.info(results_df) results_df.to_csv(save_dir / 'results.csv') @@ -164,7 +165,7 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): **vd_settings) vd_orig, vd_anon = None, None save_dir_orig, save_dir_anon = None, None - print(f'Use ASV model {spk_ext_model_dir} for computing voice similarities of original and anonymized speakers') + logging.info(f'Use ASV model {spk_ext_model_dir} for computing voice similarities of original and anonymized speakers') elif 'orig_model_dir' in params['asv_params'] and 'anon_model_dir' in params['asv_params']: # use different ASV models for original and anon speaker spaces spk_ext_model_dir_orig = find_asv_model_checkpoint(params['asv_params']['orig_model_dir']) @@ -176,7 +177,7 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): vd_anon = VoiceDistinctiveness(spk_ext_model_dir=spk_ext_model_dir_anon, score_save_dir=save_dir_anon, **vd_settings) vd = None - print(f'Use ASV model {spk_ext_model_dir_orig} for computing voice similarities of
original speakers and ASV ' + logging.info(f'Use ASV model {spk_ext_model_dir_orig} for computing voice similarities of original speakers and ASV ' f'model {spk_ext_model_dir_anon} for voice similarities of anonymized speakers') else: raise ValueError('GVD: You either need to specify one "model_dir" for both original and anonymized data or ' @@ -208,11 +209,11 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): gvd_value = vd.gvd(oo_sim, pp_sim) if vd else vd_orig.gvd(oo_sim, pp_sim) with open(trial_out_dir / 'gain_of_voice_distinctiveness', 'w') as f: f.write(str(gvd_value)) - print(f'{trial} gvd={gvd_value}') + logging.info(f'{trial} gvd={gvd_value}') def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, anon_data_suffix: str): - print(f'Train ASR model: {model_dir}') + logging.info(f'Train ASR model: {model_dir}') exp_dir = Path('exp', model_name) libri_dir = Path(libri_dir).expanduser() # could be relative to userdir ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available @@ -250,7 +251,7 @@ def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, a cwd = Path.cwd() os.chdir('evaluation/utility/asr') # espnet recipe needs several files at specific relative positions - print(Path.cwd()) + logging.debug(Path.cwd()) subprocess.run(['./asr.sh'] + train_params) subprocess.run(['ln', '-srf', exp_dir, model_dir]) @@ -258,7 +259,7 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_path, libri_dir, anon_data_suffix): - print(f'Use ASR model for evaluation: {model_path}') + logging.info(f'Use ASR model for evaluation: {model_path}') test_sets = [] for asr_dataset in eval_datasets: @@ -292,6 +293,8 @@ if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + params = parse_yaml(Path('configs', args.config)) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') @@ -312,9 +315,9 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat asv_train_params = asv_params['training'] if not model_dir.exists() or asv_train_params.get('retrain', True) is True: start_time = time.time() - print('Perform ASV training') + logging.info('Perform ASV training') asv_train(train_params=asv_train_params, output_dir=asv_params['model_dir']) - print("ASV training time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("ASV training time: %f min ---" % (float(time.time() - start_time) / 60)) model_dir = scan_checkpoint(model_dir, 'CKPT') if asv_params['vec_type'] == 'xvector': shutil.copy('evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml', model_dir) @@ -322,11 +325,11 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat shutil.copy('evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml', model_dir) if 'evaluation' in asv_params: - print('Perform ASV evaluation') + logging.info('Perform ASV evaluation') start_time = time.time() asv_eval(eval_datasets=eval_data_trials, eval_data_dir=eval_data_dir, params=asv_params, device=device, model_dir=model_dir, anon_data_suffix=anon_suffix) - print("--- EER computation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- EER computation time: %f min ---" %
(float(time.time() - start_time) / 60)) if 'utility' in eval_steps: if 'asr' in eval_steps['utility']: @@ -344,10 +347,10 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat if not model_dir.exists() or asr_train_params.get('retrain', True) is True: start_time = time.time() - print('Perform ASR training') + logging.info('Perform ASR training') asr_train(params=asr_train_params, libri_dir=libri_dir, model_name=model_name, model_dir=model_dir, anon_data_suffix=anon_suffix) - print("--- ASR training time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- ASR training time: %f min ---" % (float(time.time() - start_time) / 60)) if 'evaluation' in asr_params: asr_eval_params = asr_params['evaluation'] @@ -358,15 +361,15 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat asr_model_path = model_dir / 'asr_train_asr_transformer_raw_en_bpe5000' start_time = time.time() - print('Perform ASR evaluation') + logging.info('Perform ASR evaluation') asr_eval_sh(eval_datasets=eval_data_asr, eval_data_dir=eval_data_dir, params=asr_eval_params, model_path=asr_model_path, anon_data_suffix=anon_suffix, libri_dir=libri_dir) - print("--- ASR evaluation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- ASR evaluation time: %f min ---" % (float(time.time() - start_time) / 60)) if 'gvd' in eval_steps['utility']: gvd_params = params['utility']['gvd'] start_time = time.time() - print('Perform GVD evaluation') + logging.info('Perform GVD evaluation') gvd_eval(eval_datasets=eval_data_trials, eval_data_dir=eval_data_dir, params=gvd_params, device=device, anon_data_suffix=anon_suffix) - print("--- GVD computation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- GVD computation time: %f min ---" % (float(time.time() - start_time) / 60)) diff --git a/utils/data_io.py b/utils/data_io.py index 0bd03ee..4dabab0 100644 --- a/utils/data_io.py +++ b/utils/data_io.py @@ -2,7 +2,9 @@ from hyperpyyaml import load_hyperpyyaml, dump_hyperpyyaml import json import pandas as pd +import logging +logger = logging.getLogger(__name__) def read_kaldi_format(filename, return_as_dict=True, values_as_string=False): key_list = [] @@ -56,7 +58,7 @@ def save_kaldi_format(data, filename): #value = value.encode('utf-8') f.write(f'{key} {value}\n') except UnicodeEncodeError: - print(f'{key} {value}') + logger.error(f'{key} {value}') raise From d7f08778bc6bc4bec3df4d23015078169f7cdf99 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 10:30:59 +0100 Subject: [PATCH 32/33] Fix GVD accidentally disabled on one of the configs --- configs/eval_pre_ecapa_cos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 672224e..bf9ec9e 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -8,7 +8,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - asv utility: - asr - # - gvd + - gvd anon_data_suffix: res # suffix for dataset to signal that it is anonymized eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
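The print-to-logging conversion earlier in this series consistently applies the standard-library convention: every module creates its own logger via logging.getLogger(__name__), and only the entry-point scripts call logging.basicConfig(). A minimal sketch of that convention follows; the module function and message are illustrative, not code from the repository:

import logging

logger = logging.getLogger(__name__)  # one logger per module, named after the module

def synthesize(num_utts):
    # library code only emits records; it never installs handlers itself
    logger.info('Synthesize %d utterances...', num_utts)

if __name__ == '__main__':
    # handlers, level, and format are configured exactly once, in the entry point
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    synthesize(3)

With this split, importing a module never changes global logging state, and the entry point controls verbosity for the whole pipeline (e.g. the per-file logger.debug calls above stay silent at the INFO level).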
From a68c555c6f0e8bd84793c1cf44ef0dcf587fea5f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 14:26:04 +0100 Subject: [PATCH 33/33] Dump settings of the pool anonymizer --- .../modules/speaker_embeddings/anonymization/pool_anon.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index c24151f..6a822bd 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -105,8 +105,11 @@ def __init__( suffix (str): Suffix to append to the output folder names. """ - print(locals()) - super().__init__(vec_type=vec_type, device=device, suffix=suffix) + # forward all constructor settings to the base class; 'self' and '__class__' are captured by locals() but are not settings + kwargs = locals() + kwargs.pop("self") + kwargs.pop("__class__") + super().__init__(**kwargs) self.model_name = model_name if model_name else f"pool_{vec_type}"
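The locals()-forwarding idiom in the patch above can be sketched as follows. The class and parameter names here are illustrative stand-ins, not the repository's classes, and the sketch assumes the base class accepts every forwarded keyword argument:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BaseAnonymizer:
    def __init__(self, vec_type='xvector', device=None, suffix='', model_name=None):
        # the base class receives the complete settings dict and can dump it once
        logger.info('Anonymizer settings: %s',
                    {'vec_type': vec_type, 'device': device,
                     'suffix': suffix, 'model_name': model_name})
        self.vec_type = vec_type
        self.device = device
        self.suffix = suffix

class PoolAnonymizer(BaseAnonymizer):
    def __init__(self, vec_type='xvector', device=None, suffix='', model_name=None):
        # snapshot the arguments before binding any other local variable,
        # otherwise locals() would capture those as well
        kwargs = locals()
        kwargs.pop('self')       # the instance itself is not a keyword argument
        kwargs.pop('__class__')  # added implicitly because super() is used below
        super().__init__(**kwargs)
        self.model_name = model_name if model_name else f'pool_{vec_type}'

PoolAnonymizer(model_name='demo_pool')  # logs the full settings dict once

The upside of this pattern is that newly added constructor parameters are forwarded (and therefore dumped) automatically; the cost is that locals() must be called on the very first line of __init__, and the zero-argument super() call is what puts '__class__' into locals() in the first place.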