From 1043e90942eefd340916facd54393f8cb57e2da8 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 16 Nov 2023 14:24:23 +0100 Subject: [PATCH 01/33] Fix incomplete variable name refactoring - Completed a variable name refactoring (from vec_level to emb_level) across multiple files. --- .../speaker_embeddings/anonymization/gan_anon.py | 2 +- .../speaker_embeddings/anonymization/pool_anon.py | 4 ++-- .../speaker_embeddings/speaker_anonymization.py | 2 +- .../modules/speaker_embeddings/speaker_embeddings.py | 8 ++++---- .../modules/speaker_embeddings/speaker_extraction.py | 12 ++++++------ anonymization/pipelines/sttts_pipeline.py | 2 +- configs/anon_ims_sttts_pc.yaml | 4 ++-- evaluation/privacy/asv/asv.py | 8 ++++---- evaluation/utility/voice_distinctiveness/deid_gvd.py | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index cc2b7fd..6066574 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -50,7 +50,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), speakers=speakers, genders=genders) if self.save_intermediate: diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index bcfdb7a..6ef578e 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -58,7 +58,7 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_d def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_dir): print(pool_data_dir) if pool_vec_path.exists(): - pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) pool_embeddings.load_vectors(pool_vec_path) else: extraction_settings = {'vec_type': self.vec_type, 'emb_level': 'spk'} @@ -90,7 +90,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender if not self.cross_gender else REVERSED_GENDERS[gender]) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), speakers=speakers, genders=genders) diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py index 4374236..929a55d 100644 --- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py +++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py @@ -36,7 +36,7 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name): # simply load them print('No computation of anonymized embeddings 
necessary; load existing anonymized speaker embeddings ' 'instead...') - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level=self.emb_level, device=self.device) + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=self.emb_level, device=self.device) anon_embeddings.load_vectors(dataset_results_dir) return anon_embeddings else: diff --git a/anonymization/modules/speaker_embeddings/speaker_embeddings.py b/anonymization/modules/speaker_embeddings/speaker_embeddings.py index 5697a73..f0f57c6 100644 --- a/anonymization/modules/speaker_embeddings/speaker_embeddings.py +++ b/anonymization/modules/speaker_embeddings/speaker_embeddings.py @@ -7,9 +7,9 @@ class SpeakerEmbeddings: - def __init__(self, vec_type='xvector', vec_level='spk', device=torch.device('cpu')): + def __init__(self, vec_type='xvector', emb_level='spk', device=torch.device('cpu')): self.vec_type = vec_type - self.vec_level = vec_level + self.emb_level = emb_level self.device = device self.identifiers2idx = {} @@ -120,7 +120,7 @@ def get_spk2gender(self): return {speaker: gender for speaker, gender in zip(self.original_speakers, self.genders)} def convert_to_spk_level(self, method='average'): - assert self.vec_level == 'utt', \ + assert self.emb_level == 'utt', \ 'Speaker embeddings must be on utterance level to be able to convert them to speaker level!' if method == 'average': @@ -128,7 +128,7 @@ def convert_to_spk_level(self, method='average'): for i, speaker in enumerate(self.original_speakers): spk2idx[speaker].append(i) - spk_level_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + spk_level_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) spk_vectors, speakers, genders = [], [], [] if not isinstance(self.vectors, torch.Tensor): self.vectors = torch.tensor(self.vectors) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 9fcbf87..171d43b 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -26,7 +26,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) self.vec_type = settings['vec_type'] - self.vec_level = settings['vec_level'] + self.emb_level = settings['emb_level'] if results_dir: self.results_dir = results_dir @@ -40,7 +40,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.model_hparams = { 'vec_type': self.vec_type, - 'model_path': settings.get('vec_model_path') or model_dir + 'model_path': settings.get('embed_model_path') } if self.n_processes > 1: @@ -48,15 +48,15 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode else: self.extractors = create_extractors(hparams=self.model_hparams, device=self.devices[0]) - def extract_speakers(self, dataset_path, dataset_name=None, vec_level=None): + def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): dataset_name = dataset_name if dataset_name is not None else dataset_path.name dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') utt2spk = read_kaldi_format(dataset_path / 'utt2spk') wav_scp = read_kaldi_format(dataset_path / 'wav.scp') spk2gender = read_kaldi_format(dataset_path / 'spk2gender') - 
vec_level = vec_level if vec_level is not None else self.vec_level
+        emb_level = emb_level if emb_level is not None else self.emb_level

-        speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='utt', device=self.devices[0])
+        speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='utt', device=self.devices[0])

         if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute:
             print('No speaker extraction necessary; load existing embeddings instead...')
@@ -86,7 +86,7 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):

         speaker_embeddings.set_vectors(vectors=vectors, identifiers=utts, speakers=speakers, genders=genders)

-        if vec_level == 'spk':
+        if emb_level == 'spk':
             speaker_embeddings = speaker_embeddings.convert_to_spk_level()
         if self.save_intermediate:
             speaker_embeddings.save_vectors(dataset_results_dir)
diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py
index 7a6186f..a0fa36d 100644
--- a/anonymization/pipelines/sttts_pipeline.py
+++ b/anonymization/pipelines/sttts_pipeline.py
@@ -92,7 +92,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True):
             # Step 3: Synthesize
             wav_scp = self.speech_synthesis.synthesize_speech(dataset_name=dataset_name, texts=texts,
                                                               speaker_embeddings=anon_embeddings,
-                                                              prosody=anon_prosody, emb_level=anon_embeddings.vec_level)
+                                                              prosody=anon_prosody, emb_level=anon_embeddings.emb_level)
             anon_wav_scps[dataset_name] = wav_scp
             print('Done')

diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml
index 3eaf4a9..4b0f4fa 100644
--- a/configs/anon_ims_sttts_pc.yaml
+++ b/configs/anon_ims_sttts_pc.yaml
@@ -50,8 +50,8 @@ modules:
     emb_level: spk # possible: spk, utt
     anon_method: gan # possible: pool, random
     anon_name: gan_style-embed
-    extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<vec_level>-level
-    anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<vec_level>-level
+    extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<emb_level>-level
+    anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<emb_level>-level

     # pool_anon_settings are only used if anon_method == pool
     pool_anon_settings:
diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py
index 345923f..aea99ef 100644
--- a/evaluation/privacy/asv/asv.py
+++ b/evaluation/privacy/asv/asv.py
@@ -33,7 +33,7 @@ def __init__(self, model_dir, device, score_save_dir, distance='plda', plda_sett
         self.extractor = SpeakerExtraction(results_dir=self.score_save_dir / 'emb_xvect',
                                            model_dir=model_dir, devices=[self.device],
-                                           settings={'vec_type': vec_type, 'vec_level': 'utt'})
+                                           settings={'vec_type': vec_type, 'emb_level': 'utt'})

     def compute_trial_scores(self, trials, enrol_indices, test_indices, out_file, sim_scores):
         scores = []
@@ -85,8 +85,8 @@ def change_id_format(data_dict):
     def eer_compute(self, enrol_dir, test_dir, trial_runs_file):
         # Compute all enrol(spk level) and Test(utt level) embeddings
         # enroll vectors are the speaker-level average vectors
-        enrol_all_dict = self.extractor.extract_speakers(dataset_path=Path(enrol_dir), vec_level='spk')
-        test_all_dict = self.extractor.extract_speakers(dataset_path=Path(test_dir), vec_level='utt')
+        enrol_all_dict = self.extractor.extract_speakers(dataset_path=Path(enrol_dir), emb_level='spk')
+        test_all_dict = self.extractor.extract_speakers(dataset_path=Path(test_dir), emb_level='utt')

         enrol_vectors = []
         enrol_ids = []
@@ -148,7 +148,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids):
out_dir=plda_data_dir) print(f'Using data under {plda_data_dir}') - train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, vec_level='utt') + train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, emb_level='utt') self.plda = PLDAModel(train_embeddings=train_dict, results_path=self.plda_model_dir) plda_score_object = self.plda.compute_distance(enrollment_vectors=enrol_vectors, enrollment_ids=enrol_ids, diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index 1f47848..558eeb7 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -19,7 +19,7 @@ def __init__(self, spk_ext_model_dir, device, score_save_dir, plda_settings=None self.num_per_spk = num_per_spk self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', model_dir=spk_ext_model_dir, - devices=[device], settings={'vec_type': vec_type, 'vec_level': 'utt'}) + devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt'}) self.asv = ASV(model_dir=spk_ext_model_dir, device=device, score_save_dir=score_save_dir, distance=distance, plda_settings=plda_settings, vec_type=vec_type) From 24ff004e1594c7ea938baf1cce8477812f4e3541 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 21 Nov 2023 18:57:18 +0100 Subject: [PATCH 02/33] Fix bug: loading of 'spk' level embeddings into 'utt' level SpeakerEmbeddings object --- anonymization/modules/speaker_embeddings/speaker_extraction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 171d43b..2fca0a5 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -61,6 +61,8 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute: print('No speaker extraction necessary; load existing embeddings instead...') speaker_embeddings.load_vectors(dataset_results_dir) + # assume the loaded vectors are computed according to the setting in config + speaker_embeddings.emb_level = emb_level else: print(f'Extract embeddings of {len(wav_scp)} utterances') speaker_embeddings.new = True From 8206441cb3fd824a02215cf65f14b2025bbee369 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:12:01 +0100 Subject: [PATCH 03/33] Allow relative paths for `data_dir` --- utils/path_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/path_management.py b/utils/path_management.py index b2cdabd..c62cb25 100644 --- a/utils/path_management.py +++ b/utils/path_management.py @@ -39,7 +39,7 @@ def scan_checkpoint(cp_dir, prefix): def get_datasets(config): datasets = {} - data_dir = config.get('data_dir', None) + data_dir = config.get('data_dir', None).expanduser() # if '~' is given in path then manually expand for dataset in config['datasets']: if data_dir: for subset in dataset['enrolls'] + dataset['trials']: From e7efc5a35a81a9cfd9e2ea2143188fedd82ad0af Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:50:30 +0100 Subject: [PATCH 04/33] Dependency injection for anonymizer loading - Now the anonymization pipeline is completely separated from the anonymizer object. 
- Specify a BaseAnonymizer subclass in a config, then import that using !include syntax. Example is shown for GAN anonymizer. - Add a Passthrough anonymizer --- .../anonymization/base_anon.py | 4 ++ .../anonymization/gan_anon.py | 3 +- .../anonymization/passthrough.py | 26 ++++++++ .../anonymization/random_anon.py | 16 ++++- .../speaker_anonymization.py | 38 ++++-------- .../speaker_embeddings/speaker_extraction.py | 2 +- anonymization/pipelines/sttts_pipeline.py | 24 +++----- configs/anon/ims_gan.yaml | 9 +++ configs/anon/passthrough.yaml | 1 + configs/anon/pool.yaml | 11 ++++ configs/anon/random.yaml | 6 ++ configs/anon_ims_sttts_pc.yaml | 59 +++---------------- configs/datasets_vpc2022_official.yaml | 20 +++++++ 13 files changed, 123 insertions(+), 96 deletions(-) create mode 100644 anonymization/modules/speaker_embeddings/anonymization/passthrough.py create mode 100644 configs/anon/ims_gan.yaml create mode 100644 configs/anon/passthrough.yaml create mode 100644 configs/anon/pool.yaml create mode 100644 configs/anon/random.yaml create mode 100644 configs/datasets_vpc2022_official.yaml diff --git a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py index e9a804a..62fb888 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py @@ -7,6 +7,7 @@ class BaseAnonymizer: def __init__(self, vec_type='xvector', device=None, **kwargs): # Base class for speaker embedding anonymization. self.vec_type = vec_type + self.suffix = '_anon' if isinstance(device, torch.device): self.device = device @@ -20,3 +21,6 @@ def __init__(self, vec_type='xvector', device=None, **kwargs): def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): # Template method for anonymizing a dataset. Not implemented. raise NotImplementedError('anonymize_data') + + def to(self, device): + self.device = device diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index 6066574..f1af431 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -1,3 +1,4 @@ +from pathlib import Path import torch import numpy as np from scipy.spatial.distance import cosine @@ -15,7 +16,7 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_fil super().__init__(vec_type=vec_type, device=device) self.model_name = model_name if model_name else f'gan_{vec_type}' - self.vectors_file = vectors_file + self.vectors_file = Path(vectors_file) self.unused_indices_file = self.vectors_file.with_name(f'unused_indices_{self.vectors_file.name}') self.sim_threshold = sim_threshold self.save_intermediate = save_intermediate diff --git a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py new file mode 100644 index 0000000..742071f --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py @@ -0,0 +1,26 @@ +from .base_anon import BaseAnonymizer +import torch + +class Passthrough(BaseAnonymizer): + + def __init__(self, vec_type='xvector', device=None, **kwargs): + # Base class for speaker embedding anonymization. 
+ self.vec_type = vec_type + self.suffix = '_res' + + if isinstance(device, torch.device): + self.device = device + elif isinstance(device, str): + self.device = torch.device(device) + elif isinstance(device, int): + self.device = torch.device(f'cuda:{device}') + else: + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + # no need to refer to emb_level, + # as extractor also yields spk-level or utt-level. + return speaker_embeddings + + def to(self, device): + self.device = device diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py index 6a0c059..764e60d 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py @@ -14,11 +14,21 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, in_scale=F super().__init__(vec_type=vec_type, device=device) self.model_name = model_name if model_name else f'random_{vec_type}' - + if in_scale: - self.scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=stats_per_dim_path) + self.stats_per_dim_path = stats_per_dim_path else: - self.scaling_ranges = None + self.stats_per_dim_path = None + self._scaling_ranges = None + + @property + def scaling_ranges(self): + # defer loading of stats until they are first needed + # required after anonymizer initialization is delegated to HyperPyYAML + if self.stats_per_dim_path is not None: + self._scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=self.stats_per_dim_path) + self.stats_per_dim_path = None + return self._scaling_ranges def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): if self.scaling_ranges: diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py index 929a55d..5c82113 100644 --- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py +++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py @@ -1,6 +1,6 @@ from pathlib import Path -from .anonymization import PoolAnonymizer, RandomAnonymizer, GANAnonymizer +from .anonymization.base_anon import BaseAnonymizer from .speaker_embeddings import SpeakerEmbeddings @@ -26,6 +26,10 @@ def __init__(self, vectors_dir, device, settings, results_dir=None, save_interme raise ValueError('Results dir must be specified in parameters or settings!') self.anonymizer = self._load_anonymizer(settings) + + @property + def suffix(self): + return self.anonymizer.suffix def anonymize_embeddings(self, speaker_embeddings, dataset_name): dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else '' @@ -48,27 +52,11 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name): anon_embeddings.save_vectors(dataset_results_dir) return anon_embeddings - def _load_anonymizer(self, settings): - anon_method = settings['anon_method'] - vec_type = settings.get('vec_type', 'xvector') - model_name = settings.get('anon_name', None) - - if anon_method == 'random': - anon_settings = settings.get('random_anon_settings', {}) - model = RandomAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, **anon_settings) - - elif anon_method == 'pool': - anon_settings = settings.get('pool_anon_settings', {}) - model = PoolAnonymizer(vec_type=vec_type, device=self.device, 
model_name=model_name, - embed_model_dir=settings.get('embed_model_path', Path()), - save_intermediate=self.save_intermediate, **anon_settings) - - elif anon_method == 'gan': - anon_settings = settings.get('gan_anon_settings', {}) - model = GANAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, - save_intermediate=self.save_intermediate, **anon_settings) - else: - raise ValueError(f'Unknown anonymization method {anon_method}') - - print(f'Model type of anonymizer: {model_name}') - return model + def _load_anonymizer(self, settings: dict): + anon_method = settings['anon_method'] #HyperPyYAML already does the loading + assert isinstance(anon_method, BaseAnonymizer), \ + 'The anonymizer must be an instance of BaseAnonymizer, or a ' \ + f'subclass of it, but received an instance of {type(anon_method)}' + + print(f'Model type of anonymizer: {type(anon_method).__name__}') + return anon_method diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 2fca0a5..d986903 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -127,7 +127,7 @@ def extraction_job(data): try: spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors] - except RuntimeError: + except RuntimeError as e: print(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}') continue diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index a0fa36d..a7f1a6d 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -33,14 +33,12 @@ def __init__(self, config, force_compute_all, devices): self.speaker_extraction = SpeakerExtraction(model_dir=model_dir, devices=devices, save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) - if 'anonymizer' in modules_config['speaker_embeddings']: - self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) - else: - self.speaker_anonymization = None + force_compute=force_compute_all, + ) + self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config['speaker_embeddings'], + force_compute=force_compute_all) # Prosody component if 'prosody' in modules_config: @@ -78,11 +76,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): prosody = None # Step 2: Anonymize speaker, change prosody - if self.speaker_anonymization: - anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings, - dataset_name=dataset_name) - else: - anon_embeddings = spk_embeddings + anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings,dataset_name=dataset_name) if self.prosody_anonymization: anon_prosody = self.prosody_anonymization.anonymize_prosody(prosody=prosody) @@ -99,13 +93,11 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): if prepare_results: if self.speaker_anonymization: anon_vectors_path = self.speaker_anonymization.results_dir, - anon_suffix = '_anon' else: anon_vectors_path = self.speaker_extraction.results_dir - anon_suffix = '_res' now = 
datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M')
             prepare_evaluation_data(dataset_dict=datasets, anon_wav_scps=anon_wav_scps,
-                                    anon_vectors_path=anon_vectors_path, anon_suffix=anon_suffix,
+                                    anon_vectors_path=anon_vectors_path, anon_suffix=self.speaker_anonymization.suffix,
                                     output_path=self.results_dir / 'formatted_data' / now)
             save_yaml(self.config, self.results_dir / 'formatted_data' / now / 'config.yaml')

diff --git a/configs/anon/ims_gan.yaml b/configs/anon/ims_gan.yaml
new file mode 100644
index 0000000..8e19edc
--- /dev/null
+++ b/configs/anon/ims_gan.yaml
@@ -0,0 +1,9 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.gan_anon.GANAnonymizer
+    models_dir: null # overridden by main .yaml
+    vec_type: null # overridden by main .yaml
+    save_intermediate: null # overridden by main .yaml
+    model_name: null # overridden by constructor
+    vectors_file: !ref <models_dir>/anonymization/style-embed_wgan.pt
+    gan_model_path: !ref <models_dir>/anonymization/gan_<vec_type>/<vec_type>_wgan.pt
+    num_sampled: 5000
+    sim_threshold: 0.7
\ No newline at end of file
diff --git a/configs/anon/passthrough.yaml b/configs/anon/passthrough.yaml
new file mode 100644
index 0000000..36eac4a
--- /dev/null
+++ b/configs/anon/passthrough.yaml
@@ -0,0 +1 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.passthrough.Passthrough
\ No newline at end of file
diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml
new file mode 100644
index 0000000..ccdfd8d
--- /dev/null
+++ b/configs/anon/pool.yaml
@@ -0,0 +1,11 @@
+# pool_anon_settings are only used if anon_method == pool
+pool_data_dir: !ref <data_dir>/libritts_train_other_500
+pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
+N: 200
+N_star: 100
+distance: plda # possible: plda, cosine
+plda_dir: !ref <models_dir>/distances/plda/libritts_train_other_500_xvector
+cross_gender: false
+proximity: farthest # possible: farthest, nearest, center
+scaling: maxmin # possible: none, maxmin, mean
+stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
\ No newline at end of file
diff --git a/configs/anon/random.yaml b/configs/anon/random.yaml
new file mode 100644
index 0000000..c12afe6
--- /dev/null
+++ b/configs/anon/random.yaml
@@ -0,0 +1,6 @@
+!new:anonymization.modules.speaker_embeddings.anonymization.random_anon.RandomAnonymizer
+    models_dir: null # will be overridden by main .yaml
+    vec_type: null # will be overridden by main .yaml
+    model_name: null # will be overridden by constructor
+    in_scale: false
+    stats_per_dim_path: !ref <models_dir>/anonymization/random_in-scale_<vec_type>/stats_per_dim.json
\ No newline at end of file
diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml
index 4b0f4fa..e135243 100644
--- a/configs/anon_ims_sttts_pc.yaml
+++ b/configs/anon_ims_sttts_pc.yaml
@@ -2,33 +2,15 @@ root_dir : ..
 data_dir: !ref <root_dir>/data # TODO adjust path
 save_intermediate: true
 save_output: true
-force_compute_all: false
-
-datasets:
-  - name: libri_dev
-    data: libri
-    set: dev
-    enrolls: [enrolls]
-    trials: [trials_f, trials_m]
-  - name: libri_test
-    data: libri
-    set: test
-    enrolls: [enrolls]
-    trials: [trials_f, trials_m]
-  - name: vctk_dev
-    data: vctk
-    set: dev
-    enrolls: [enrolls]
-    trials: [trials_f_all, trials_m_all]
-  - name: vctk_test
-    data: vctk
-    set: test
-    enrolls: [enrolls]
-    trials: [trials_f_all, trials_m_all]
 results_dir: !ref <root_dir>/results # TODO adjust path
 models_dir: !ref <root_dir>/models # TODO adjust path
 vectors_dir: !ref <root_dir>/results/original_speaker_embeddings
+
+force_compute_all: false
+save_intermediate: true
+datasets: !include:datasets_vpc2022_official.yaml
+
 pipeline: sttts

 modules:
@@ -48,36 +30,13 @@ modules:
     vec_type: style-embed
     embed_model_path: !ref <models_dir>/tts/Embedding/embedding_function.pt
     emb_level: spk # possible: spk, utt
-    anon_method: gan # possible: pool, random
-    anon_name: gan_style-embed
+    anon_method: !include:anon/ims_gan.yaml # possible: pool, random
+        models_dir: !ref <models_dir>
+        save_intermediate: !ref <save_intermediate>
+        vec_type: !ref <vec_type>
     extraction_results_path: !ref <results_dir>/original_speaker_embeddings/<vec_type>_<emb_level>-level
     anon_results_path: !ref <results_dir>/anon_speaker_embeddings/<vec_type>_<emb_level>-level

-    # pool_anon_settings are only used if anon_method == pool
-    pool_anon_settings:
-      pool_data_dir: !ref <data_dir>/libritts_train_other_500
-      pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
-      N: 200
-      N_star: 100
-      distance: plda # possible: plda, cosine
-      plda_dir: !ref <models_dir>/distances/plda/libritts_train_other_500_xvector
-      cross_gender: false
-      proximity: farthest # possible: farthest, nearest, center
-      scaling: maxmin # possible: none, maxmin, mean
-      stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
-
-    # random_anon_settings are only used if anon_method == random
-    random_anon_settings:
-      in_scale: true
-      stats_per_dim_path: !ref <models_dir>/anonymization/stats_per_dim.json
-
-    # gan_anon_settings are only used if anon_method == gan
-    gan_anon_settings:
-      vectors_file: !ref <models_dir>/anonymization/style-embed_wgan_generated_vectors.pt
-      gan_model_path: !ref <models_dir>/anonymization/style-embed_wgan.pt
-      num_sampled: 5000
-      sim_threshold: 0.7
-
   prosody:
     extractor_type: ims
     aligner_model_path: !ref <models_dir>/tts/Aligner/aligner.pt
diff --git a/configs/datasets_vpc2022_official.yaml b/configs/datasets_vpc2022_official.yaml
new file mode 100644
index 0000000..bf7e494
--- /dev/null
+++ b/configs/datasets_vpc2022_official.yaml
@@ -0,0 +1,20 @@
+  - name: libri_dev
+    data: libri
+    set: dev
+    enrolls: [enrolls]
+    trials: [trials_f, trials_m]
+  - name: libri_test
+    data: libri
+    set: test
+    enrolls: [enrolls]
+    trials: [trials_f, trials_m]
+  - name: vctk_dev
+    data: vctk
+    set: dev
+    enrolls: [enrolls]
+    trials: [trials_f_all, trials_m_all]
+  - name: vctk_test
+    data: vctk
+    set: test
+    enrolls: [enrolls]
+    trials: [trials_f_all, trials_m_all]
\ No newline at end of file

From 714b754a930fd0c2c3ad53e827c354df69acfca7 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 23 Nov 2023 16:51:05 +0100
Subject: [PATCH 05/33] Add utility to manipulate already existing VPC datasets

- Can convert the .scp files to absolute or relative paths.
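For illustration, a typical invocation might look like the following (the checkout path is a placeholder; Typer derives the `--vpc-baseline-path` flag from the script's `vpc_baseline_path` argument):

```bash
# rewrite the wav.scp entries of all listed datasets as absolute paths
python utils/relative_scp_to_abs.py --vpc-baseline-path ~/Voice-Privacy-Challenge-2022 --to absolute

# revert them to paths relative to the challenge folder
python utils/relative_scp_to_abs.py --vpc-baseline-path ~/Voice-Privacy-Challenge-2022 --to relative
```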
--- utils/relative_scp_to_abs.py | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 utils/relative_scp_to_abs.py diff --git a/utils/relative_scp_to_abs.py b/utils/relative_scp_to_abs.py new file mode 100644 index 0000000..e585870 --- /dev/null +++ b/utils/relative_scp_to_abs.py @@ -0,0 +1,76 @@ +from pathlib import Path +import typer +from typing import Optional +from typing_extensions import Annotated + + +def main( + vpc_baseline_path: Annotated[Optional[Path], typer.Option()], + to: str = 'absolute' + ): + """ + This script adapts evaluation datasets generated by the VoicePrivacy + Challenge 2022 framework codebase. The necessary changes are: + * Adapting the .scp files such that data paths are not relative to + the VPC baseline/ folder anymore but absolute. + * The changes can be also reverted with the same script. To do so, + just call it with --to=relative + """ + vpc_baseline_path = vpc_baseline_path.expanduser() + assert vpc_baseline_path.exists(), \ + f'The supplied path to VPC framework ({vpc_baseline_path}) does not exist' + + # determine the transform + if to.casefold() == 'absolute': + transform = lambda string: f'{vpc_baseline_path / string}' + elif to.casefold() == 'relative': + transform = lambda string: Path(string).relative_to(vpc_baseline_path) + + # for each dataset load .scp + dataset_list = [ + 'vctk_test_trials_m', + 'vctk_test_trials_m_common', + 'vctk_test_trials_m_all', + 'vctk_test_trials_f', + 'vctk_test_trials_f_common', + 'vctk_test_trials_f_all', + 'vctk_test_trials_all', + 'vctk_test_enrolls', + 'vctk_dev_trials_m', + 'vctk_dev_trials_m_common', + 'vctk_dev_trials_m_all', + 'vctk_dev_trials_f', + 'vctk_dev_trials_f_common', + 'vctk_dev_trials_f_all', + 'vctk_dev_trials_all', + 'vctk_dev_enrolls', + 'libri_test_trials_m', + 'libri_test_trials_f', + 'libri_test_trials_all', + 'libri_test_enrolls', + 'libri_dev_trials_m', + 'libri_dev_trials_f', + 'libri_dev_trials_all', + 'libri_dev_enrolls', + ] + for dataset in dataset_list: + dataset_path = vpc_baseline_path / 'data' / dataset + lines = [] + with open(dataset_path / 'wav.scp') as scp: + for line in scp.readlines(): + # scp format is {utt} {path}\n + items = line.split(' ') + new_line = f'{items[0]} {transform(items[1].strip())}\n' + lines.append(new_line) + # validate + for line in lines: + line = line.split(' ')[1].strip() + if to == 'relative': + line = vpc_baseline_path / line + assert Path(line).exists(), f'Line {line} has issues, exiting.' + with open(dataset_path / 'wav.scp', mode='w') as scp: + for line in lines: + scp.writelines(line) + +if __name__ == '__main__': + typer.run(main) \ No newline at end of file From fe8cdbd4fee26b94d652da4e4730ef711ae4e9a9 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 23 Nov 2023 16:52:14 +0100 Subject: [PATCH 06/33] Add makefile & environment.yaml for dependency tracking. - Makefile installs the environment local to the project folder. 
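The intended workflow, sketched here for illustration (assumes `conda` is installed; `mamba` is picked up automatically when available):

```bash
make install            # create the ./voicepat_env environment from environment.yaml
make pretrained_models  # download and unpack the IMS anonymization, ASR and TTS models into models/
make help               # list all documented targets
```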
--- Makefile | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ environment.yaml | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Makefile create mode 100644 environment.yaml diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..714666d --- /dev/null +++ b/Makefile @@ -0,0 +1,53 @@ +############################### +## CONFIGURATION +############################### +PHONY: install uninstall pretrained_models espnet +.ONESHELL: + +PROJECT_NAME = voicepat +ENV_NAME = $(PROJECT_NAME)_env + +ifeq (, $(shell mamba --version)) +CONDA = conda +else +CONDA = mamba +endif + +############################### +##@ INSTALLATION +############################### + +install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment + +espnet: ## installs ESPNet + echo Deactivated + +uninstall: + @rm -rf $(ENV_NAME) + @rm -rf ESPNet + @rm -rf models/ + +pretrained_models: ## downloads the pretrained models from IMS repositories + @echo Downloading models from IMS repositories + @rm -rf models + @mkdir -p models + @wget -q -O models/anonymization.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/anonymization.zip + @wget -q -O models/asr.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/asr.zip + @wget -q -O models/tts.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/tts.zip + @unzip -oq models/asr.zip -d models + @unzip -oq models/tts.zip -d models + @unzip -oq models/anonymization.zip -d models + @rm models/*.zip + + +$(ENV_NAME): environment.yaml + @($(CONDA) env create -f $< -p ./$@ && @echo Installation complete, please run `conda-develop .` once.) 
|| $(CONDA) env update -f $< -p ./$@
+	@conda config --set env_prompt '($$(basename {default_env})) '
+	@(cat .gitignore | grep -q $(ENV_NAME)) || echo $(ENV_NAME) >> .gitignore
+
+###############################
+##@ SELF-DOCUMENTING COMMAND
+###############################
+
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
diff --git a/environment.yaml b/environment.yaml
new file mode 100644
index 0000000..058b844
--- /dev/null
+++ b/environment.yaml
@@ -0,0 +1,47 @@
+name: voicepat_env
+channels:
+  - nvidia
+  - pytorch
+  - conda-forge
+  - default
+dependencies:
+  - pip
+  - pip:
+    - speechbrain
+    - noisereduce
+    - pyloudnorm
+    - phonemizer
+    - praat-parselmouth
+    - espnet
+    - espnet_model_zoo
+  - python
+  - urllib3
+  - matplotlib
+  - seaborn
+  - jupyter
+  - jupyterlab
+  - ipywidgets
+  - ipympl
+  - blas=1.0=mkl
+  - numpy
+  - cvxopt
+  - scipy
+  - scikit-learn
+  - pyyaml
+  - click
+  - typer
+  - tqdm
+  - openpyxl
+  - librosa
+  - resampy
+  - python-sounddevice
+  - pytorch::pytorch-cuda
+  - pytorch::ignite
+  - pytorch::torchaudio
+  - tensorboardx
+  - tensorboard
+  - optuna
+  - hydra-core
+  - typeguard==2.13.3
+  - conda-build
+  - torchvision # only to prevent warnings
\ No newline at end of file

From e83892f073bcfb52936c86ebb3432b67a02a6c15 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 23 Nov 2023 17:09:58 +0100
Subject: [PATCH 07/33] Update "README.md"s

---
 README.md               | 91 ++++++++++++++++++-----------------------
 anonymization/README.md |  5 +++
 2 files changed, 45 insertions(+), 51 deletions(-)

diff --git a/README.md b/README.md
index 7596fdd..0988589 100644
--- a/README.md
+++ b/README.md
@@ -1,88 +1,77 @@
 # [VoicePAT: Voice Privacy Anonymization Toolkit](http://arxiv.org/abs/2309.08049)

-**Note: This repository and its documentation are still under construction but can already be used for both
-anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicPAT framework.
-If you are interested in contributing, please leave comments on a GitHub issue.**
-
-VoicePAT is a toolkit for speaker anonymization research, with special focus on speaker anonymization.
-It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements:
-
-* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of
-  anonymized speech is independent of the evaluation of anonymization systems. Both processes do not need to be
-  executed in the same run or with the same settings. Of course, you need to perform the anonymization of evaluation
-  data with one system before you can evaluate it but this could have happened at an earlier time and with an
-  external codebase.
-* Anonymization and evaluation procedures are **structured as pipelines** consisting of separate **modules**. Each
-  module may have a selection of different models or algorithm to fulfill its role. The settings for each procedure
-  / pipeline are defined exclusively in configuration files. See the *Usage* section below for more information.
-* **Evaluation models** have been exchanged by models based on [SpeechBrain](https://github.com/speechbrain/speechbrain/) and [ESPnet](https://github.com/espnet/espnet/) which are **more powerful** than the - previous Kaldi-based models. Furthermore, we added new techniques to make evaluation significantly **more - efficient**. -* The framework is written in **Python**, making it easy to include and adapt other Python-based models, e.g., using - PyTorch. When using the framework, you do not need in-depth knowledge about anything outside the Python realm - (Disclaimer: While being written in Python, the ASR evaluation is currently included with an ESPnet-based model - which in turn is based on Kaldi. However, you do not need to modify that part of the code for using or - changing the ASR model and ESPnet is currently working on a Kaldi-free version.) +**Note: This repository and its documentation are still under construction but can already be used for both anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicePAT framework. If you are interested in contributing, please leave comments on a GitHub issue.** +VoicePAT is a toolkit for speaker anonymization research. It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements: + +* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of anonymized speech is independent of the evaluation of anonymization systems. Both processes do not need to be executed in the same run or with the same settings. Of course, you need to perform the anonymization of evaluation data with one system before you can evaluate it but this could have happened at an earlier time and with an external codebase. +* Anonymization and evaluation procedures are **structured as pipelines** consisting of separate **modules**. Each module may have a selection of different models or algorithm to fulfill its role. The settings for each procedure / pipeline are defined exclusively in configuration files. See the *Usage* section below for more information. +* **Evaluation models** have been exchanged by models based on [SpeechBrain](https://github.com/speechbrain/speechbrain/) and [ESPnet](https://github.com/espnet/espnet/) which are **more powerful** than the previous Kaldi-based models. Furthermore, we added new techniques to make evaluation significantly **more efficient**. +* The framework is written in **Python**, making it easy to include and adapt other Python-based models, e.g., using PyTorch. When using the framework, you do not need in-depth knowledge about anything outside the Python realm. (Disclaimer: While being written in Python, the ASR evaluation is currently included with an ESPnet-based model which in turn is based on Kaldi. However, you do not need to modify that part of the code for using or changing the ASR model and ESPnet is currently working on a Kaldi-free version.) ## Installation -Simply clone the repository and install the dependencies in [requirements.txt](requirements.txt). If you want to use -the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to -it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``. 
+ +Requires `conda` for environment management. Installation of `mamba` is also recommended for speeding up the environment-related tasks. Simply clone the repository and run the following commands, a conda environment will be generated in the project root folder and the pretrained models will be downloaded. + +```bash +sudo apt install libespeak-ng # alternatively use your own package manager +make install pretrained_models +``` + +The datasets have to be downloaded via the VoicePrivacy Challenge framework. Once the download is complete, the `.scp` files need to be converted to the absolute path, because they are relative to the challenge folder. Use [utils/relative_scp_to_abs.py](utils/relative_scp_to_abs.py) for this purpose. Then simply point `data_path` in the YAML configurations to the data folder of the VoicePrivacy Challenge framework. + +If you want to use the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``. ## Usage ![](figures/framework.png) -For using the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can -also add more modules and models to the code and create your own config by using the existing ones as template. - +For using the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can also add more modules and models to the code and create your own config by using the existing ones as template. The configuration files use HyperPyYAML syntax, for which a useful reference is available [here](https://colab.research.google.com/drive/1Pg9by4b6-8QD2iC0U7Ic3Vxq4GEwEdDz?usp=sharing). ### Anonymization + The framework currently contains only one pipeline and config for anonymization, [anon_ims_sttts_pc.yaml](configs/anon_ims_sttts_pc.yaml). If you are using this config, you need to modify at least the following entries: -``` -data_dir: path to original data in Kaldi-format for anonymization -results_dir: path to location for all (intermediate) results of the anonymization -models_dir: path to models location + +```YAML +data_dir: # path to original data in Kaldi-format for anonymization +results_dir: # path to location for all (intermediate) results of the anonymization +models_dir: # path to models location ``` Running an anonymization pipeline is done like this: + ``` python run_anonymization.py --config anon_ims_sttts_pc.yaml --gpu_ids 0,1 --force_compute ``` -This will perform all computations that support parallel computing on the gpus with ID 0 and 1, and on GPU 0 -otherwise. If no gpu_ids are specified, it will run only on GPU 0 or CPU, depending on whether cuda is available. -`--force_compute` causes all previous computations to be run again. In most cases, you can delete that flag from the -command to speed up the anonymization. -Pretrained models for this anonymization can be found at [https://github. -com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases. +This will perform all computations that support parallel computing on the gpus with ID 0 and 1, and on GPU 0 otherwise. If no gpu_ids are specified, it will run only on GPU 0 or CPU, depending on whether cuda is available. `--force_compute` causes all previous computations to be run again. 
In most cases, you can delete that flag from the command to speed up the anonymization. + +Pretrained models for this anonymization can be found at [https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases. ### Evaluation -All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, -you need to adapt at least + +All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, you need to adapt at least + ``` eval_data_dir: path to anonymized evaluation data in Kaldi-format asr/libri_dir: path to original LibriSpeech dataset ``` Running an evaluation pipeline is done like this: + ``` python run_evaluation.py --config eval_pre_ecapa_cos.yaml --gpu_ids 1,2,3 ``` -making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, it will default to CUDA:0 or -use all GPUs if -cuda is available, or run on CPU otherwise. -Pretrained evaluation models can be found in release v1. +making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, it will default to CUDA:0 or use all GPUs if cuda is available, or run on CPU otherwise. + +Pretrained evaluation models can be found in release v1. ## Acknowledgements + Several parts of this toolkit are based on or use code from external sources, i.e., -* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation -* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization) - for - anonymization -See the READMEs for [anonymization](anonymization/README.md) and [evaluation](evaluation/README.md) for more -information. +* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation +* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization) for anonymization +See the READMEs for [anonymization](anonymization/README.md) and [evaluation](evaluation/README.md) for more information. diff --git a/anonymization/README.md b/anonymization/README.md index 7006f3c..a8226ff 100644 --- a/anonymization/README.md +++ b/anonymization/README.md @@ -4,4 +4,9 @@ The anonymization branch can contain multiple pipelines, modules and models. So [Speech-to-Text-to-Speech (STTTS) pipeline](https://ieeexplore.ieee.org/document/10096607), based on this code: [https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization). + +# Experiment with different speaker embedding mappings + +This is now simplified: you can define your anonymizer (a function that yields a speaker embedding when a speaker embedding is supplied) using the `!new` syntax of HyperPyYAML in a config file (e.g., see [ims_gan.yaml](../configs/anon/ims_gan.yaml)). The only requirement is that your anonymizer must implement the `BaseAnonymizer` API (see [base_anon.py](modules/speaker_embeddings/anonymization/base_anon.py)). 
+
 *This documentation is still under construction and will be extended soon.*
\ No newline at end of file

From a20f0532307d4ab7176f2272755a614f51b34519 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Fri, 24 Nov 2023 16:39:34 +0100
Subject: [PATCH 08/33] Temporary fix to #1: unstable prosody inference

- Each libespeak backend instantiation creates a copy of the library.
- For some reason, the previous instances (one per utterance) are not garbage collected.
- After 1500 to 2000 ProsodyExtraction calls at the latest, the anonymization pipeline crashes.
- As a temporary solution, since ProsodyExtraction does not support `n_processes>1`, we can make the different instances share the backend, as long as its specs do not change. This also accelerates the prosody extraction. A better fix could also allow parallel execution, but I could not see how to do that today.
---
 .../IMSToucan/Preprocessing/TextFrontend.py   | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py b/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
index db1cb9e..d30563c 100644
--- a/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
+++ b/anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py
@@ -13,6 +13,7 @@


 class ArticulatoryCombinedTextFrontend:
+    _backend = None

     def __init__(self,
                  language,
@@ -156,11 +157,23 @@ def __init__(self,
             print("Language not supported yet")
             sys.exit()

-        self.phonemizer_backend = EspeakBackend(language=self.g2p_lang,
-                                                punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
-                                                preserve_punctuation=True,
-                                                language_switch='remove-flags',
-                                                with_stress=self.use_stress)
+        # temporary: share the backend if config matches. this prevents multitasking.
+        if ArticulatoryCombinedTextFrontend._backend is None:
+            ArticulatoryCombinedTextFrontend._backend = EspeakBackend(language=self.g2p_lang,
+                                                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
+                                                                      preserve_punctuation=True,
+                                                                      language_switch='remove-flags',
+                                                                      with_stress=self.use_stress,
+                                                                      )
+        elif ArticulatoryCombinedTextFrontend._backend.language != self.g2p_lang or \
+                ArticulatoryCombinedTextFrontend._backend._with_stress != self.use_stress:
+            ArticulatoryCombinedTextFrontend._backend = EspeakBackend(language=self.g2p_lang,
+                                                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/。【】、‥،؟“”؛',
+                                                                      preserve_punctuation=True,
+                                                                      language_switch='remove-flags',
+                                                                      with_stress=self.use_stress,
+                                                                      )
+        self.phonemizer_backend = ArticulatoryCombinedTextFrontend._backend

         self.phone_to_vector = generate_feature_table()
         self.phone_to_id = get_phone_to_id()

From d8c2f48b35f24fb3eedcac60f05fa7a966d389bc Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 28 Nov 2023 11:59:38 +0100
Subject: [PATCH 09/33] Fix #2 - filenames given in `hyperparams.yaml` interpreted as path

---
 .../privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml   | 4 +---
 evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml  | 4 +---
 .../privacy/asv/asv_train/hparams/xvector/hyperparams.yaml | 4 +---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
index 41928df..1b92bb2 100755
--- a/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml
@@ -48,6 +48,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file
diff --git a/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
index 41928df..1b92bb2 100755
--- a/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml
@@ -48,6 +48,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file
diff --git a/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml b/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
index 8cf88fd..f889c2a 100644
--- a/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
+++ b/evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml
@@ -51,6 +51,4 @@ modules:

 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     loadables:
-        embedding_model: !ref <embedding_model>
-    paths:
-        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        embedding_model: !ref <embedding_model>
\ No newline at end of file

From 109525a384e56226371a8a0e51281740bd028c91 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Thu, 30 Nov 2023 12:03:27 +0100
Subject: [PATCH 10/33] Standardize `SpeakerExtraction` config and instantiation across anonymization and evaluation pipelines

---
 .../embedding_methods/speechbrain_vectors.py  | 12 ++++++++----
 .../speaker_embeddings/speaker_extraction.py  |  3 ++-
 anonymization/pipelines/sttts_pipeline.py     | 15 ++++++++-------
 evaluation/privacy/asv/asv.py                 |  4 ++--
 .../utility/voice_distinctiveness/deid_gvd.py |  4 ++--
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git
a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py index 83fd2fd..7c3bb18 100644 --- a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py +++ b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py @@ -11,13 +11,17 @@ class SpeechBrainVectors: 'ecapa': 'spkrec-ecapa-voxceleb' } - def __init__(self, vec_type, device, model_path=None): + def __init__(self, vec_type, device, model_path: Path = None): self.device = device if model_path is not None and model_path.exists(): - model_path = str(Path(model_path).absolute()) - self.extractor = EncoderClassifier.from_hparams(source=model_path, savedir=model_path, - run_opts={'device': self.device}) + model_path = Path(model_path).absolute() + savedir = model_path.parent + self.extractor = EncoderClassifier.from_hparams( + source=str(model_path), + savedir=str(savedir), + run_opts={'device': self.device} + ) else: if model_path is None: model_path = Path('') diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index d986903..5ce6cfb 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -25,6 +25,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.save_intermediate = save_intermediate self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) + self.embed_model_path = settings['embed_model_path'] self.vec_type = settings['vec_type'] self.emb_level = settings['emb_level'] @@ -40,7 +41,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode self.model_hparams = { 'vec_type': self.vec_type, - 'model_path': settings.get('embed_model_path') + 'model_path': self.embed_model_path, } if self.n_processes > 1: diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index a7f1a6d..53c542a 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -30,25 +30,25 @@ def __init__(self, config, force_compute_all, devices): settings=modules_config['asr'], force_compute=force_compute_all) # Speaker component - self.speaker_extraction = SpeakerExtraction(model_dir=model_dir, devices=devices, + self.speaker_extraction = SpeakerExtraction(devices=devices, save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all, + force_compute=force_compute, ) self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], save_intermediate=save_intermediate, settings=modules_config['speaker_embeddings'], - force_compute=force_compute_all) + force_compute=force_compute) # Prosody component if 'prosody' in modules_config: self.prosody_extraction = ProsodyExtraction(device=devices[0], save_intermediate=save_intermediate, settings=modules_config['prosody'], - force_compute=force_compute_all) + force_compute=force_compute) if 'anonymizer' in modules_config['prosody']: self.prosody_anonymization = ProsodyAnonymization(save_intermediate=save_intermediate, settings=modules_config['prosody'], - force_compute=force_compute_all) + force_compute=force_compute) else: self.prosody_anonymization = None else: @@ -57,7 
+57,8 @@ def __init__(self, config, force_compute_all, devices): # TTS component self.speech_synthesis = SpeechSynthesis(devices=[devices[0]], settings=modules_config['tts'], model_dir=model_dir, save_output=config.get('save_output', True), - force_compute=force_compute_all) + force_compute=force_compute, + ) def run_anonymization_pipeline(self, datasets, prepare_results=True): anon_wav_scps = {} @@ -92,7 +93,7 @@ def run_anonymization_pipeline(self, datasets, prepare_results=True): if prepare_results: if self.speaker_anonymization: - anon_vectors_path = self.speaker_anonymization.results_dir, + anon_vectors_path = self.speaker_anonymization.results_dir else: anon_vectors_path = self.speaker_extraction.results_dir now = datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M') diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py index aea99ef..6792d44 100644 --- a/evaluation/privacy/asv/asv.py +++ b/evaluation/privacy/asv/asv.py @@ -32,8 +32,8 @@ def __init__(self, model_dir, device, score_save_dir, distance='plda', plda_sett self.plda_anon = None self.extractor = SpeakerExtraction(results_dir=self.score_save_dir / 'emb_xvect', - model_dir=model_dir, devices=[self.device], - settings={'vec_type': vec_type, 'emb_level': 'utt'}) + devices=[self.device], + settings={'vec_type': vec_type, 'emb_level': 'utt', 'embed_model_path': model_dir}) def compute_trial_scores(self, trials, enrol_indices, test_indices, out_file, sim_scores): scores = [] diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index 558eeb7..fb2c9df 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -18,8 +18,8 @@ def __init__(self, spk_ext_model_dir, device, score_save_dir, plda_settings=None vec_type='xvector', num_per_spk='all'): self.num_per_spk = num_per_spk - self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', model_dir=spk_ext_model_dir, - devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt'}) + self.extractor = SpeakerExtraction(results_dir=score_save_dir / 'emb_xvect', + devices=[device], settings={'vec_type': vec_type, 'emb_level': 'utt', 'embed_model_path': spk_ext_model_dir}) self.asv = ASV(model_dir=spk_ext_model_dir, device=device, score_save_dir=score_save_dir, distance=distance, plda_settings=plda_settings, vec_type=vec_type) From 628e6637011b5e2a4a50e0d970d22187fbd253fd Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 30 Nov 2023 22:18:22 +0100 Subject: [PATCH 11/33] Add "incomplete config exception" to inform users about TODO entries - Also fixed a duplicate `save_intermediate` entry in `anon_ims_sttts_pc.yaml` --- configs/anon_ims_sttts_pc.yaml | 7 +++---- configs/eval_pre_ecapa_cos.yaml | 4 ++-- utils/config_primitives.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 utils/config_primitives.py diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index e135243..ff9effe 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -1,14 +1,13 @@ root_dir : .. -data_dir: !ref /data # TODO adjust path +data_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /data save_intermediate: true save_output: true -results_dir: !ref /results # TODO adjust path -models_dir: !ref /models # TODO adjust path +results_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. 
/results +models_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false -save_intermediate: true datasets: !include:datasets_vpc2022_official.yaml pipeline: sttts diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 4fa6518..5d11375 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !new:utils.config_primitives.Todo # TODO path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !new:utils.config_primitives.Todo # TODO path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/utils/config_primitives.py b/utils/config_primitives.py new file mode 100644 index 0000000..dc715ed --- /dev/null +++ b/utils/config_primitives.py @@ -0,0 +1,10 @@ +class IncompleteConfigException(Exception): + pass + + +class Todo: + def __init__(self, *args, **kwargs): + raise IncompleteConfigException( + 'You must complete the config files before execution. ' + 'Find lines with "TODO" and enter the appropriate information.' + ) From 0ff54677ebcd9c747e69364639b2db81e701042b Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 1 Dec 2023 11:20:05 +0100 Subject: [PATCH 12/33] Fix evaluation config not pointing to dataset .yaml --- configs/eval_pre_ecapa_cos.yaml | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 5d11375..ca444ab 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
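With the `utils.config_primitives.Todo` guard from patch 11/33 above, an unfilled TODO entry now fails at config-load time instead of deep inside a run: HyperPyYAML's `!new:` tag instantiates the class while parsing, and `Todo.__init__` raises immediately. A minimal sketch of that behavior, assuming `utils/config_primitives.py` from this patch is importable and using an inline YAML string purely for illustration:

    # Minimal sketch: loading a config that still contains a TODO entry fails fast.
    # Assumes utils/config_primitives.py from patch 11/33 is on the Python path.
    from hyperpyyaml import load_hyperpyyaml

    yaml_string = "data_dir: !new:utils.config_primitives.Todo  # TODO adjust path"

    try:
        config = load_hyperpyyaml(yaml_string)
    except Exception as exc:  # IncompleteConfigException, raised in Todo.__init__
        print(f"Refusing to run: {exc}")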
Remove entry to skip privacy: From 06c11eca90ee56c8f749ea65d3d6833b89527840 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:05 +0100 Subject: [PATCH 13/33] Fixes to the anonymizer classes - BaseAnonymizer and its descendants are now dumpable - Improved documentation - The configs now use !PLACEHOLDER tag from HyperPyYAML instead of the custom class --- .../anonymization/base_anon.py | 58 ++++- .../anonymization/gan_anon.py | 95 +++++-- .../anonymization/passthrough.py | 44 ++-- .../anonymization/pool_anon.py | 241 +++++++++++++----- .../anonymization/random_anon.py | 98 +++++-- configs/anon/ims_gan.yaml | 9 +- configs/anon/pool.yaml | 24 +- configs/anon_ims_sttts_pc.yaml | 6 +- configs/eval_gvd_both.yaml | 2 +- configs/eval_post_ecapa_cos_ft.yaml | 2 +- configs/eval_post_ecapa_cos_scratch.yaml | 4 +- configs/eval_post_xvector_plda_scratch.yaml | 4 +- configs/eval_pre_ecapa_cos.yaml | 4 +- configs/eval_pre_ecapa_plda.yaml | 4 +- configs/eval_pre_xvector_cos.yaml | 4 +- configs/eval_pre_xvector_plda.yaml | 4 +- utils/config_primitives.py | 10 - 17 files changed, 445 insertions(+), 168 deletions(-) delete mode 100644 utils/config_primitives.py diff --git a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py index 62fb888..753720c 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py @@ -1,26 +1,70 @@ from pathlib import Path import torch +import ruamel.yaml as yaml +from ruamel.yaml.representer import RoundTripRepresenter, SafeRepresenter +from typing import Union class BaseAnonymizer: + """ + Base class for speaker embedding anonymizers, defining the API, + that consists of the following methods: + - anonymize_embeddings + - to + """ + def __init__( + self, + vec_type: str, + device: Union[str, torch.device, int, None], + suffix: str, + **kwargs, + ): + assert suffix[0] == "_", "Suffix must be a string and start with an underscore." - def __init__(self, vec_type='xvector', device=None, **kwargs): # Base class for speaker embedding anonymization. self.vec_type = vec_type - self.suffix = '_anon' + self.suffix = suffix if isinstance(device, torch.device): self.device = device elif isinstance(device, str): self.device = torch.device(device) elif isinstance(device, int): - self.device = torch.device(f'cuda:{device}') + self.device = torch.device(f"cuda:{device}") else: - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self.device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + # ensure dumpability + self.kwargs = kwargs + self.kwargs["vec_type"] = self.vec_type + self.kwargs["device"] = str(self.device) + self.kwargs["suffix"] = self.suffix + + def __repr__(self): + if hasattr(self, "kwargs"): + return f"{self.__class__.__name__}({self.kwargs})" + else: + return f"{self.__class__.__name__}()" + + def to_yaml(self, representer: yaml.Representer): + # first get data into dict format + data = {f"!new:{type(self).__qualname__}": self.kwargs} + return_str = representer.represent_dict(data) + return return_str + + def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk") -> torch.Tensor: # Template method for anonymizing a dataset. Not implemented. 
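The `to_yaml` hook together with the `add_multi_representer` registration (at the end of this `base_anon.py` hunk, just below) is what makes `BaseAnonymizer` and all of its subclasses dumpable: ruamel.yaml's multi-representer matches by inheritance, so registering the base class once covers every anonymizer. A stripped-down sketch of the mechanism with illustrative class names:

    # Stripped-down sketch of the multi-representer pattern used in base_anon.py:
    # registering the base class once makes every subclass YAML-dumpable.
    import sys
    import ruamel.yaml
    from ruamel.yaml.representer import RoundTripRepresenter

    class Base:
        def __init__(self, **kwargs):
            self.kwargs = kwargs  # stored solely so the object can be re-serialized

        def to_yaml(self, representer):
            return representer.represent_dict({f"!new:{type(self).__qualname__}": self.kwargs})

    class Child(Base):
        pass

    RoundTripRepresenter.add_multi_representer(Base, lambda rep, data: data.to_yaml(rep))

    yaml = ruamel.yaml.YAML()  # round-trip mode by default
    yaml.dump(Child(vec_type="xvector", device="cpu"), sys.stdout)

The dumped mapping mirrors the `!new:` constructor syntax of the configs (as a plain string key in this sketch), which is why storing the constructor arguments in `self.kwargs` is enough to make the objects round-trippable.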
- raise NotImplementedError('anonymize_data') - + raise NotImplementedError("anonymize_data") + def to(self, device): self.device = device + + +# necessary to make BaseAnonymizer and subclasses dumpable +RoundTripRepresenter.add_multi_representer( + BaseAnonymizer, lambda representer, data: data.to_yaml(representer) +) diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py index f1af431..0fb5757 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -2,7 +2,9 @@ import torch import numpy as np from scipy.spatial.distance import cosine +from os import PathLike from tqdm import tqdm +from typing import Union from .base_anon import BaseAnonymizer from ..speaker_embeddings import SpeakerEmbeddings @@ -10,14 +12,48 @@ class GANAnonymizer(BaseAnonymizer): + """ + Implementation of the anonymizer proposed in the paper "Anonymizing + speech with generative adversarial networks to preserve speaker + privacy" (https://arxiv.org/pdf/2210.07002.pdf). + """ + def __init__( + self, + vec_type: str = "xvector", + device: Union[str, torch.device, int] = "cuda:0", + model_name: Union[str, PathLike] = None, + vectors_file: Union[str, PathLike] = None, + sim_threshold: float = 0.7, + gan_model_path: Union[str, PathLike] = None, + num_sampled: int = 1000, + save_intermediate: bool = False, + suffix: str = '_anon', + **kwargs, + ): + """ + Args: + vec_type: The type of the speaker embedding to anonymize. Valid + values are 'xvector', 'style-embed', 'ecapa' + device: The computation device to use for the anonymization. + model_name: The filename of the model used for the anonymization. + Defaults to 'gan_{vec_type}'. + vectors_file: The path to the file containing the GAN vectors. + Defaults to 'gan_vectors_{vec_type}.pt'. + sim_threshold: The minimum cosine similarity between the original + speaker embedding and the anonymized embedding. + gan_model_path: The path to the GAN model. + num_sampled: The number of GAN vectors to sample. + save_intermediate: If True, the GAN vectors and the unused indices + will be saved to files. + suffix: The suffix to append to the output files. 
+ """ + super().__init__(vec_type=vec_type, device=device, suffix=suffix) - def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_file=None, sim_threshold=0.7, - gan_model_path=None, num_sampled=1000, save_intermediate=False, **kwargs): - super().__init__(vec_type=vec_type, device=device) - - self.model_name = model_name if model_name else f'gan_{vec_type}' + self.model_name = model_name if model_name else f"gan_{vec_type}" self.vectors_file = Path(vectors_file) - self.unused_indices_file = self.vectors_file.with_name(f'unused_indices_{self.vectors_file.name}') + self.unused_indices_file = self.vectors_file.with_name( + f"unused_indices_{self.vectors_file.name}" + ) self.sim_threshold = sim_threshold self.save_intermediate = save_intermediate self.n = num_sampled @@ -25,17 +61,32 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_fil if self.vectors_file.is_file(): self.gan_vectors = torch.load(self.vectors_file, map_location=self.device) if self.unused_indices_file.is_file(): - self.unused_indices = torch.load(self.unused_indices_file, map_location='cpu') + self.unused_indices = torch.load( + self.unused_indices_file, map_location="cpu" + ) else: self.unused_indices = np.arange(len(self.gan_vectors)) else: - self.gan_vectors, self.unused_indices = self._generate_artificial_embeddings(gan_model_path, self.n) + ( + self.gan_vectors, + self.unused_indices, + ) = self._generate_artificial_embeddings(gan_model_path, self.n) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - if emb_level == 'spk': - print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') - elif emb_level == 'utt': - print(f'Anonymize embeddings of {len(speaker_embeddings)} utterances...') + def anonymize_embeddings( + self, speaker_embeddings: torch.Tensor, emb_level: str = "spk" + ): + """ + Anonymize speaker embeddings using the GAN model. + Args: + speaker_embeddings: [n_embeddings, n_channels] Speaker + embeddings to be anonymized. + emb_level: Embedding level ('spk' for speaker level + or 'utt' for utterance level). 
+ """ + if emb_level == "spk": + print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...") + elif emb_level == "utt": + print(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...") identifiers = [] speakers = [] @@ -51,16 +102,22 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): anon_vectors.append(anon_vec) genders.append(gender) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - speakers=speakers, genders=genders) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, + genders=genders, + ) if self.save_intermediate: torch.save(self.unused_indices, self.unused_indices_file) return anon_embeddings - def _generate_artificial_embeddings(self, gan_model_path, n): - print(f'Generate {n} artificial speaker embeddings...') + def _generate_artificial_embeddings(self, gan_model_path: Path, n: int): + print(f"Generate {n} artificial speaker embeddings...") generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device) gan_vectors = generator.generate_embeddings(n=n) unused_indices = np.arange(len(gan_vectors)) @@ -70,7 +127,7 @@ def _generate_artificial_embeddings(self, gan_model_path, n): torch.save(unused_indices, self.unused_indices_file) return gan_vectors, unused_indices - def _select_gan_vector(self, spk_vec): + def _select_gan_vector(self, spk_vec: torch.Tensor): i = 0 limit = 20 while i < limit: diff --git a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py index 742071f..d21545d 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/passthrough.py +++ b/anonymization/modules/speaker_embeddings/anonymization/passthrough.py @@ -1,26 +1,36 @@ from .base_anon import BaseAnonymizer import torch +from typing import Union +import ruamel.yaml as yaml -class Passthrough(BaseAnonymizer): - def __init__(self, vec_type='xvector', device=None, **kwargs): - # Base class for speaker embedding anonymization. - self.vec_type = vec_type - self.suffix = '_res' +class Passthrough(BaseAnonymizer): + """ + A 'Passthrough' 'anonymizer' that does not anonymize the speaker embeddings. + """ - if isinstance(device, torch.device): - self.device = device - elif isinstance(device, str): - self.device = torch.device(device) - elif isinstance(device, int): - self.device = torch.device(f'cuda:{device}') - else: - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + def __init__( + self, + vec_type: str = "", + device: Union[str, torch.device, int] = "cuda:0", + suffix: str = "_res", + **kwargs + ): + super().__init__(vec_type, device, suffix, **kwargs) - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - # no need to refer to emb_level, + def anonymize_embeddings( + self, speaker_embeddings: torch.Tensor, emb_level: str = "spk" + ) -> torch.Tensor: + """ + Returns the speaker embeddings unchanged. + """ + # no need to refer to emb_level, # as extractor also yields spk-level or utt-level. return speaker_embeddings - + def to(self, device): - self.device = device + """ + Move the anonymizer to the given device. 
For the passthrough anonymizer, + this is a no-op, apart from setting the property. + """ + super().to(device) diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index 6ef578e..3a21db1 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -3,6 +3,9 @@ import torch import json from tqdm import tqdm +from typing import Union + +from os import PathLike from sklearn.metrics.pairwise import cosine_distances from sklearn.preprocessing import minmax_scale, StandardScaler @@ -12,42 +15,127 @@ from ..speaker_embeddings import SpeakerEmbeddings from utils import transform_path -REVERSED_GENDERS = {'m': 'f', 'f': 'm'} +REVERSED_GENDERS = { + "m": "f", + "f": "m" +} class PoolAnonymizer(BaseAnonymizer): + """ + An implementation of the 'Pool' anonymization method, that is based on the + primary baseline of the Voice Privacy Challenge 2020. + + For every source x-vector, an anonymized x-vector is computed by finding + the N farthest x-vectors in an external pool (LibriTTS train-other-500) + according to the PLDA distance, and by averaging N∗ randomly selected + vectors among them. In the baseline, we use: + N = 200, + N∗ = 100. + """ + def __init__( + self, + vec_type: str = "xvector", + device: Union[str, torch.device, int, None] = None, + model_name: str = None, + pool_data_dir: Union[str, PathLike] = "data/libritts_train_other_500", + pool_vec_path: Union[str, PathLike] = "original_speaker_embeddings/pool_embeddings", + N: int = 200, + N_star: int = 100, + distance: str = "plda", + cross_gender: bool = False, + proximity: bool = "farthest", + scaling: str = None, + stats_per_dim_path: Union[str, PathLike] = None, + distance_model_path: Union[str, PathLike] = "distances/plda/libritts_train_other_500_xvector", + embed_model_path: Union[str, PathLike] = None, + save_intermediate: bool = False, + suffix: str = "_anon", + **kwargs, + ): + """ + Args: + vec_type (str): Type of the speaker embeddings, currently supported + are 'xvector', 'ecapa', 'style-embed'. + + device (Union[str, torch.device, int, None]): Device to use for + the procedure, e.g. 'cpu', 'cuda', 'cuda:0', etc. + + model_name (str): Name of the model, used for distances that + require a model (e.g., PLDA). + + pool_data_dir (Union[str, PathLike]): Path to the audio data + which will be used for x-vector pool extraction. + + pool_vec_path (Union[str, PathLike]): Path to the stored + speaker embeddings of the pool. + + N (int): Number of most 'fitting' vectors to consider. + + N_star (int): Number of vectors to randomly select from the N most + 'fitting' vectors, to compute the average. + + distance (str): Distance measure, either 'plda' or 'cosine'. + + cross_gender (bool): Whether to switch genders of the speakers + during anonymization. + + proximity (str): Proximity measure, determining which vectors in + the pool are the 'fittest', can be either 'farthest', + 'nearest' or 'center'. 
- def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_dir='data/libritts_train_other_500', - pool_vec_path='original_speaker_embeddings/pool_embeddings', N=200, N_star=100, distance='plda', - cross_gender=False, proximity='farthest', scaling=None, stats_per_dim_path=None, - distance_model_path='distances/plda/libritts_train_other_500_xvector', - embed_model_dir=None, save_intermediate=False, **kwargs): - # Pool anonymization method based on the primary baseline of the Voice Privacy Challenge 2020. - # Given a speaker vector, the N most distant vectors in an external speaker pool are extracted, - # and an average of a random subset of N_star vectors is computed and taken as new speaker vector. - # Default distance measure is PLDA. - super().__init__(vec_type=vec_type, device=device) - - self.model_name = model_name if model_name else f'pool_{vec_type}' - - self.N = N # number of most distant vectors to consider - self.N_star = N_star # number of vectors to include in averaged vector - self.proximity = proximity # proximity method, either 'farthest' (distant vectors), 'nearest', or 'closest' - self.cross_gender = cross_gender # Whether to reverse the genders of the speakers + scaling (str): Scaling method to use, can be either 'minmax' or + 'std'. + + stats_per_dim_path (Union[str, PathLike]): Path to the file + containing the statistics for each dimension in the given + embedding type. + + distance_model_path (Union[str, PathLike]): Path to the stored + distance model (required for PLDA). + + embed_model_path (Union[str, PathLike]): Path to the directory + containing the speaker embedding model. + + save_intermediate (bool): Whether to save intermediate results. + + suffix (str): Suffix to append to the output folder names. 
+ + """ + print(locals()) + super().__init__(vec_type=vec_type, device=device, suffix=suffix) + + self.model_name = model_name if model_name else f"pool_{vec_type}" + + self.N = N + self.N_star = N_star + self.proximity = proximity + self.cross_gender = cross_gender self.save_intermediate = save_intermediate # external speaker pool - self.pool_embeddings = self._load_pool_embeddings(pool_data_dir=Path(pool_data_dir), - pool_vec_path=Path(pool_vec_path), - embed_model_dir=Path(embed_model_dir)) - self.pool_genders = {gender: [i for i, spk_gender in enumerate(self.pool_embeddings.genders) - if spk_gender == gender] for gender in set(self.pool_embeddings.genders)} + self.pool_embeddings = self._load_pool_embeddings( + pool_data_dir=Path(pool_data_dir).expanduser(), + pool_vec_path=Path(pool_vec_path).expanduser(), + embed_model_path=Path(embed_model_path).expanduser(), + ) + self.pool_genders = { + gender: [ + i + for i, spk_gender in enumerate(self.pool_embeddings.genders) + if spk_gender == gender + ] + for gender in set(self.pool_embeddings.genders) + } # distance model; PLDA model if distance == plda; None if distance == cosine self.distance = distance # distance measure, either 'plda' or 'cosine' - if self.distance == 'plda': - self.distance_model = PLDAModel(train_embeddings=self.pool_embeddings, - results_path=Path(distance_model_path), save_plda=self.save_intermediate) + if self.distance == "plda": + self.distance_model = PLDAModel( + train_embeddings=self.pool_embeddings, + results_path=Path(distance_model_path), + save_plda=self.save_intermediate, + ) else: self.distance_model = None @@ -55,23 +143,32 @@ def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_d self.scaling = scaling self.stats_per_dim_path = stats_per_dim_path or Path() - def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_dir): + def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_path): print(pool_data_dir) if pool_vec_path.exists(): - pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='spk', device=self.device) + pool_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, emb_level="spk", device=self.device + ) pool_embeddings.load_vectors(pool_vec_path) else: - extraction_settings = {'vec_type': self.vec_type, 'emb_level': 'spk'} - emb_extractor = SpeakerExtraction(results_dir=pool_vec_path, model_dir=embed_model_dir, device=self.device, - settings=extraction_settings, save_intermediate=self.save_intermediate) - pool_embeddings = emb_extractor.extract_speakers(dataset_path=pool_data_dir, dataset_name='') + extraction_settings = {"vec_type": self.vec_type, "emb_level": "spk", "embed_model_path": embed_model_path} + emb_extractor = SpeakerExtraction( + results_dir=pool_vec_path, + devices=[self.device], + settings=extraction_settings, + save_intermediate=self.save_intermediate, + ) + pool_embeddings = emb_extractor.extract_speakers( + dataset_path=pool_data_dir, dataset_name="" + ) return pool_embeddings - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): - distance_matrix = self._compute_distances(vectors_a=self.pool_embeddings.vectors, - vectors_b=speaker_embeddings.vectors) + def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk"): + distance_matrix = self._compute_distances( + vectors_a=self.pool_embeddings.vectors, vectors_b=speaker_embeddings.vectors + ) - print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') + print(f"Anonymize embeddings of 
{len(speaker_embeddings)} speakers...") identifiers = [] speakers = [] anon_vectors = [] @@ -83,23 +180,37 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): gender = speaker_embeddings.genders[i] distances_to_speaker = distance_matrix[:, i] candidates = self._get_pool_candidates(distances_to_speaker, gender) - selected_anon_pool = np.random.choice(candidates, self.N_star, replace=False) - anon_vec = torch.mean(self.pool_embeddings.speaker_vectors[selected_anon_pool], dim=0) + selected_anon_pool = np.random.choice( + candidates, self.N_star, replace=False + ) + anon_vec = torch.mean( + self.pool_embeddings.speaker_vectors[selected_anon_pool], dim=0 + ) identifiers.append(identifier) speakers.append(speaker) anon_vectors.append(anon_vec) - genders.append(gender if not self.cross_gender else REVERSED_GENDERS[gender]) + genders.append( + gender if not self.cross_gender else REVERSED_GENDERS[gender] + ) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, emb_level=emb_level) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - speakers=speakers, genders=genders) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, + genders=genders, + ) return anon_embeddings def _compute_distances(self, vectors_a, vectors_b): - if self.distance == 'plda': - return 1 - self.distance_model.compute_distance(enrollment_vectors=vectors_a, trial_vectors=vectors_b) - elif self.distance == 'cosine': + if self.distance == "plda": + return 1 - self.distance_model.compute_distance( + enrollment_vectors=vectors_a, trial_vectors=vectors_b + ) + elif self.distance == "cosine": return cosine_distances(X=vectors_a.cpu(), Y=vectors_b.cpu()) else: return [] @@ -110,45 +221,49 @@ def _get_pool_candidates(self, distances, gender): else: distances = distances[self.pool_genders[gender]] - if self.proximity == 'farthest': - return np.argpartition(distances, -self.N)[-self.N:] - elif self.proximity == 'nearest': - return np.argpartition(distances, self.N)[:self.N] - elif self.proximity == 'center': + if self.proximity == "farthest": + return np.argpartition(distances, -self.N)[-self.N :] + elif self.proximity == "nearest": + return np.argpartition(distances, self.N)[: self.N] + elif self.proximity == "center": sorted_distances = np.sort(distances) - return sorted_distances[len(sorted_distances)//2:(len(sorted_distances)//2)+self.N] + return sorted_distances[ + len(sorted_distances) // 2 : (len(sorted_distances) // 2) + self.N + ] def _load_scaling_ranges(self, stats_per_dim_path): if stats_per_dim_path and Path(stats_per_dim_path).exists(): with open(stats_per_dim_path) as f: dim_ranges = json.load(f) - return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + return [ + (v["min"], v["max"]) + for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0])) + ] else: - raise FileNotFoundError(f'You need to specify a path to an existing file containing the statistics for ' - f'each dimension in the given embedding type, ' - f'stats_per_dim_path={stats_per_dim_path} is not valid!') + raise FileNotFoundError( + f"You need to specify a path to an existing file containing the statistics for " + f"each dimension in the given embedding type, " + f"stats_per_dim_path={stats_per_dim_path} is not valid!" 
+ ) def _scale_embeddings(self, embeddings): vectors = embeddings.vectors.cpu().numpy() - if self.scaling == 'minmax': + if self.scaling == "minmax": scaling_ranges = self._load_scaling_ranges(self.stats_per_dim_path) scaled_dims = [] for i in range(len(scaling_ranges)): - scaled_dims.append(minmax_scale(vectors[:, i], scaling_ranges[i], axis=0)) + scaled_dims.append( + minmax_scale(vectors[:, i], scaling_ranges[i], axis=0) + ) scaled_vectors = torch.tensor(np.array(scaled_dims)).T.to(self.device) embeddings.vectors = scaled_vectors - elif self.scaling == 'std': + elif self.scaling == "std": std_scaler = StandardScaler() std_scaler.fit(self.pool_embeddings.vectors.cpu().numpy()) scaled_vectors = torch.tensor(std_scaler.transform(vectors)) embeddings.vectors = scaled_vectors return embeddings - -# for every source x-vector, an anonymized x-vector is computed by finding the N farthest x- -# vectors in an external pool (LibriTTS train-other-500) accord- -# ing to the PLDA distance, and by averaging N ∗ randomly se- -# lected vectors among them. In the baseline, we use N = 200 and N ∗ = 100 \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py index 764e60d..98ffc84 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py @@ -1,6 +1,8 @@ import json from pathlib import Path import torch +from os import PathLike +from typing import Union import numpy as np from .base_anon import BaseAnonymizer @@ -8,13 +10,42 @@ class RandomAnonymizer(BaseAnonymizer): - - def __init__(self, vec_type='xvector', device=None, model_name=None, in_scale=False, stats_per_dim_path=None, - **kwargs): + """ + An anonymizer module that generates random vectors for each speaker or + utterance. The vectors are generated by sampling from a uniform + distribution for each dimension. The range of the uniform distribution + is determined by the minimum and maximum values of the original + speaker/utterance vectors. + """ + def __init__( + self, + device: Union[str, torch.device, int, None], + vec_type: str = "xvector", + model_name: str = None, + in_scale: bool = False, + stats_per_dim_path: Union[str, PathLike] =None, + **kwargs, + ): + """ + Args: + device: The computation device to use for the anonymization. + vec_type: The type of the speaker embedding to anonymize. Valid + values are 'xvector', 'style-embed', 'ecapa' + model_name: The name of the model used for the anonymization. + Defaults to 'random_{vec_type}'. + in_scale: If True, the anonymized vectors will be in the same + scale as the original vectors. Otherwise, the vectors will be + sampled from a uniform distribution with the same range for + each dimension. + stats_per_dim_path: The path to the json file containing the + minimum and maximum values for each dimension of the original + vectors. If None, the stats will be loaded from the file + 'stats_per_dim.json'. 
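Put together, `_compute_distances`, `_get_pool_candidates`, and the averaging in `anonymize_embeddings` above implement the VPC 2020 primary-baseline recipe: take the N farthest pool vectors, then average a random subset of N* of them. A compact numpy sketch with toy sizes, using cosine distance instead of PLDA and omitting the gender filtering that the class performs:

    # Compact sketch of PoolAnonymizer's selection/averaging (toy sizes; cosine
    # distance instead of PLDA, gender filtering omitted).
    import numpy as np

    def pool_anonymize(src, pool, N=200, N_star=100, rng=np.random.default_rng()):
        dists = 1 - pool @ src / (np.linalg.norm(pool, axis=1) * np.linalg.norm(src))
        farthest = np.argpartition(dists, -N)[-N:]            # indices of the N farthest
        subset = rng.choice(farthest, N_star, replace=False)  # random N* of those
        return pool[subset].mean(axis=0)                      # average -> anonymized vector

    pool = np.random.randn(500, 192)  # stand-in x-vector pool
    anon_vec = pool_anonymize(np.random.randn(192), pool)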
+ """ super().__init__(vec_type=vec_type, device=device) - self.model_name = model_name if model_name else f'random_{vec_type}' - + self.model_name = model_name if model_name else f"random_{vec_type}" + if in_scale: self.stats_per_dim_path = stats_per_dim_path else: @@ -26,13 +57,23 @@ def scaling_ranges(self): # defer loading of stats until they are first needed # required after anonymizer initialization is delegated to HyperPyYAML if self.stats_per_dim_path is not None: - self._scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=self.stats_per_dim_path) + self._scaling_ranges = self._load_scaling_ranges( + stats_per_dim_path=self.stats_per_dim_path + ) self.stats_per_dim_path = None return self._scaling_ranges - def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + def anonymize_embeddings(self, speaker_embeddings, emb_level="spk"): + """ + Anonymize speaker embeddings using random vectors. + Args: + speaker_embeddings: [n_embeddings, n_channels] Speaker + embeddings to be anonymized. + emb_level: Embedding level ('spk' for speaker level or 'utt' for + utterance level). + """ if self.scaling_ranges: - print('Anonymize vectors in scale!') + print("Anonymize vectors in scale!") return self._anonymize_data_in_scale(speaker_embeddings) else: identifiers = [] @@ -40,24 +81,38 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): speakers = speaker_embeddings.original_speakers genders = speaker_embeddings.genders for identifier, vector in speaker_embeddings: - mask = torch.zeros(vector.shape[0]).float().random_(-40, 40).to(self.device) + mask = ( + torch.zeros(vector.shape[0]) + .float() + .random_(-40, 40) + .to(self.device) + ) anon_vec = vector * mask identifiers.append(identifier) anon_vectors.append(anon_vec) - anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), - genders=genders, speakers=speakers) + anon_embeddings = SpeakerEmbeddings( + vec_type=self.vec_type, device=self.device, emb_level=emb_level + ) + anon_embeddings.set_vectors( + identifiers=identifiers, + vectors=torch.stack(anon_vectors, dim=0), + genders=genders, + speakers=speakers, + ) return anon_embeddings def _load_scaling_ranges(self, stats_per_dim_path): if stats_per_dim_path is None: - stats_per_dim_path = Path('stats_per_dim.json') + stats_per_dim_path = Path("stats_per_dim.json") with open(stats_per_dim_path) as f: dim_ranges = json.load(f) - return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + return [ + (v["min"], v["max"]) + for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0])) + ] def _anonymize_data_in_scale(self, speaker_embeddings): identifiers = [] @@ -66,13 +121,18 @@ def _anonymize_data_in_scale(self, speaker_embeddings): genders = speaker_embeddings.genders for identifier, vector in speaker_embeddings: - anon_vec = torch.tensor([np.random.uniform(*dim_range) - for dim_range in self.scaling_ranges]).to(self.device) + anon_vec = torch.tensor( + [np.random.uniform(*dim_range) for dim_range in self.scaling_ranges] + ).to(self.device) identifiers.append(identifier) anon_vectors.append(anon_vec) anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) - anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), genders=genders, - speakers=speakers) + anon_embeddings.set_vectors( + identifiers=identifiers, + 
vectors=torch.stack(anon_vectors, dim=0), + genders=genders, + speakers=speakers, + ) - return anon_embeddings \ No newline at end of file + return anon_embeddings diff --git a/configs/anon/ims_gan.yaml b/configs/anon/ims_gan.yaml index 8e19edc..270baaa 100644 --- a/configs/anon/ims_gan.yaml +++ b/configs/anon/ims_gan.yaml @@ -1,9 +1,8 @@ !new:anonymization.modules.speaker_embeddings.anonymization.gan_anon.GANAnonymizer - models_dir: null # overridden by main .yaml - vec_type: null # overriden by main .yaml - save_intermediate: null # overriden by main .yaml - model_name: null # overriden by constructor - vectors_file: !ref /anonymization/style-embed_wgan.pt + models_dir: !PLACEHOLDER # overridden by main .yaml + vec_type: !PLACEHOLDER # overriden by main .yaml + save_intermediate: !PLACEHOLDER # overriden by main .yaml + vectors_file: !ref /anonymization/_wgan.pt gan_model_path: !ref /anonymization/gan_/_wgan.pt num_sampled: 5000 sim_threshold: 0.7 \ No newline at end of file diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml index ccdfd8d..5bad248 100644 --- a/configs/anon/pool.yaml +++ b/configs/anon/pool.yaml @@ -1,11 +1,13 @@ -# pool_anon_settings are only used if anon_method == pool -pool_data_dir: !ref /libritts_train_other_500 -pool_vec_path: !ref /style-embed_spk-level/pool_embeddings -N: 200 -N_star: 100 -distance: plda # possible: plda, cosine -plda_dir: !ref /distances/plda/libritts_train_other_500_xvector -cross_gender: false -proximity: farthest # possible: farthest, nearest, center -scaling: maxmin # possible: none, maxmin, mean -stats_per_dim_path: !ref /anonymization/stats_per_dim.json \ No newline at end of file +!new:anonymization.modules.speaker_embeddings.anonymization.pool_anon.PoolAnonymizer + data_dir: !PLACEHOLDER # to be overridden by the main config + embed_model_path: !PLACEHOLDER # to be overridden by the main config + pool_data_dir: !ref /libritts_train_other_500 + pool_vec_path: !ref /style-embed_spk-level/pool_embeddings + N: 200 + N_star: 100 + distance: plda # possible: plda, cosine + plda_dir: !ref /distances/plda/libritts_train_other_500_xvector + cross_gender: false + proximity: farthest # possible: farthest, nearest, center + scaling: maxmin # possible: none, maxmin, mean + stats_per_dim_path: !ref /anonymization/stats_per_dim.json \ No newline at end of file diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index ff9effe..87760af 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -1,10 +1,10 @@ root_dir : .. -data_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /data +data_dir: !PLACEHOLDER save_intermediate: true save_output: true -results_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /results -models_dir: !new:utils.config_primitives.Todo # TODO adjust path, e.g. /models +results_dir: !PLACEHOLDER # TODO adjust path, e.g. /results +models_dir: !PLACEHOLDER # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false diff --git a/configs/eval_gvd_both.yaml b/configs/eval_gvd_both.yaml index 10735bf..c0b8b91 100644 --- a/configs/eval_gvd_both.yaml +++ b/configs/eval_gvd_both.yaml @@ -38,7 +38,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
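The switch from the custom `Todo` class to HyperPyYAML's built-in `!PLACEHOLDER` in the configs above keeps the fail-fast behavior without any repository code: loading aborts unless the key is overridden, and `!ref` entries are interpolated from other keys. A small illustration (the key names are made up; the angle-bracket reference syntax is HyperPyYAML's):

    # Illustration of !PLACEHOLDER and !ref at load time (illustrative keys).
    from hyperpyyaml import load_hyperpyyaml

    yaml_string = """
    models_dir: !PLACEHOLDER
    vectors_file: !ref <models_dir>/anonymization/xvector_wgan.pt
    """

    # Without an override for models_dir, load_hyperpyyaml raises; with one, the
    # !ref entry is resolved against the supplied value.
    config = load_hyperpyyaml(yaml_string, overrides={"models_dir": "models"})
    print(config["vectors_file"])  # -> models/anonymization/xvector_wgan.pt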
+eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: diff --git a/configs/eval_post_ecapa_cos_ft.yaml b/configs/eval_post_ecapa_cos_ft.yaml index 9e06d69..610d80a 100644 --- a/configs/eval_post_ecapa_cos_ft.yaml +++ b/configs/eval_post_ecapa_cos_ft.yaml @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_ft # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_post_ecapa_cos_scratch.yaml b/configs/eval_post_ecapa_cos_scratch.yaml index cc9ddae..05aa510 100644 --- a/configs/eval_post_ecapa_cos_scratch.yaml +++ b/configs/eval_post_ecapa_cos_scratch.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_scratch # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_post_xvector_plda_scratch.yaml b/configs/eval_post_xvector_plda_scratch.yaml index e46d9bd..9326102 100644 --- a/configs/eval_post_xvector_plda_scratch.yaml +++ b/configs/eval_post_xvector_plda_scratch.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_anon_scratch # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index ca444ab..f81a568 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -11,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !new:utils.config_primitives.Todo # TODO path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
+eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -42,7 +42,7 @@ privacy: utility: asr: - libri_dir: !new:utils.config_primitives.Todo # TODO path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_plda.yaml b/configs/eval_pre_ecapa_plda.yaml index a9cf2a6..f27d44a 100644 --- a/configs/eval_pre_ecapa_plda.yaml +++ b/configs/eval_pre_ecapa_plda.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_cos.yaml b/configs/eval_pre_xvector_cos.yaml index 4c3c7d2..1c9fc4f 100644 --- a/configs/eval_pre_xvector_cos.yaml +++ b/configs/eval_pre_xvector_cos.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_plda.yaml b/configs/eval_pre_xvector_plda.yaml index 68986fc..c7e3d04 100644 --- a/configs/eval_pre_xvector_plda.yaml +++ b/configs/eval_pre_xvector_plda.yaml @@ -41,7 +41,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -72,7 +72,7 @@ privacy: utility: asr: - libri_dir: TODO # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. 
+ libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/utils/config_primitives.py b/utils/config_primitives.py deleted file mode 100644 index dc715ed..0000000 --- a/utils/config_primitives.py +++ /dev/null @@ -1,10 +0,0 @@ -class IncompleteConfigException(Exception): - pass - - -class Todo: - def __init__(self, *args, **kwargs): - raise IncompleteConfigException( - 'You must complete the config files before execution. ' - 'Find lines with "TODO" and enter the appropriate information.' - ) From 3e902231914ad7fa61f1c707618f6a7d9e1645b5 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:29 +0100 Subject: [PATCH 14/33] Makefile now downloads and extracts the pretrained models for evaluation too --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 714666d..fa31058 100644 --- a/Makefile +++ b/Makefile @@ -34,14 +34,18 @@ pretrained_models: ## downloads the pretrained models from IMS repositories @wget -q -O models/anonymization.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/anonymization.zip @wget -q -O models/asr.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/asr.zip @wget -q -O models/tts.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/tts.zip + @wget -q -O models/pre_eval_models.zip https://github.com/DigitalPhonetics/VoicePAT/releases/download/v1/pre_eval_models.zip @unzip -oq models/asr.zip -d models @unzip -oq models/tts.zip -d models @unzip -oq models/anonymization.zip -d models + @unzip -oq models/pre_eval_models.zip -d models + @mkdir evaluation/utility/asr/exp + @ln -srf evaluation/utility/asr/exp exp @rm models/*.zip $(ENV_NAME): environment.yaml - @($(CONDA) env create -f $< -p ./$@ && @echo Installation complete, please run `conda-develop .` once.) || $(CONDA) env update -f $< -p ./$@ + @($(CONDA) env create -f $< -p ./$@ && echo Installation complete, please run `conda develop .` once.) 
|| $(CONDA) env update -f $< -p ./$@ @conda config --set env_prompt '($$(basename {default_env})) ' @(cat .gitignore | grep -q $(ENV_NAME)) || echo $(ENV_NAME) >> .gitignore From 77fd42f2fbd7a2117fb6ff3c23dbbfcb2fbdf103 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 11:37:43 +0100 Subject: [PATCH 15/33] Updates to the environment --- environment.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/environment.yaml b/environment.yaml index 058b844..7d9a926 100644 --- a/environment.yaml +++ b/environment.yaml @@ -30,18 +30,20 @@ dependencies: - pyyaml - click - typer + - typer-cli - tqdm - openpyxl - librosa - resampy - python-sounddevice - - pytorch::pytorch-cuda + - pytorch::pytorch-cuda - pytorch::ignite - pytorch::torchaudio + - cudatoolkit-dev - tensorboardx - tensorboard - optuna - hydra-core - typeguard==2.13.3 - conda-build - - torchvision # only to prevent warnings \ No newline at end of file + - torchvision # only to prevent warnings From 714f1e6797944bc62d01b3ae1fde4390409137ed Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 15:50:15 +0100 Subject: [PATCH 16/33] Minor fixes to run_evaluation.sh - Use absolute paths for `asr.sh` invocation - Improved documentation --- run_evaluation.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/run_evaluation.py b/run_evaluation.py index 65213bc..d81ab07 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser from pathlib import Path import pandas as pd +from typing import List parser = ArgumentParser() parser.add_argument('--config', default='config_eval.yaml') @@ -210,15 +211,16 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): print(f'{trial} gvd={gvd_value}') -def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): +def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, anon_data_suffix: str): print(f'Train ASR model: {model_dir}') exp_dir = Path('exp', model_name) + libri_dir = Path(libri_dir).expanduser() # could be relative to userdir ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available train_params = [ '--lang', 'en', '--ngpu', str(ngpu), - '--expdir', str(exp_dir), + '--expdir', str(exp_dir.absolute()), '--use_lm', 'false', '--nbpe', '5000', '--num_utt', str(params['num_utt']), @@ -233,11 +235,11 @@ def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): asr_config = 'conf/train_asr_transformer.yaml' if params.get('anon', False): - local_data_opts = ' '.join([str(libri_dir), str(params['train_data_dir']), anon_data_suffix]) + local_data_opts = ' '.join([str(libri_dir.absolute()), str(params['train_data_dir'].absolute()), anon_data_suffix]) train_set = f'train_clean_360_{anon_data_suffix}' if params.get('finetuning', False) is True: asr_config = 'conf/train_asr_transformer_anon.yaml' - train_params.extend(['--pretrained_model', f'{str(params["pretrained_model"])}/valid.acc.ave.pth']) + train_params.extend(['--pretrained_model', f'{str(params["pretrained_model"].absolute())}/valid.acc.ave.pth']) else: local_data_opts = str(libri_dir) train_set = 'train_clean_360' @@ -255,20 +257,20 @@ def asr_train(params, libri_dir, model_name, model_dir, anon_data_suffix): os.chdir(cwd) -def asr_eval_sh(eval_datasets, eval_data_dir, params, model_path, libri_dir, anon_data_suffix): +def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, 
params, model_path, libri_dir, anon_data_suffix): print(f'Use ASR model for evaluation: {model_path}') test_sets = [] for asr_dataset in eval_datasets: anon_asr_dataset = f'{asr_dataset}_{anon_data_suffix}' - test_sets.append(str(eval_data_dir / asr_dataset)) - test_sets.append(str(eval_data_dir / anon_asr_dataset)) + test_sets.append(str((eval_data_dir / asr_dataset).absolute())) + test_sets.append(str((eval_data_dir / anon_asr_dataset).absolute())) ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available inference_params = [ '--ngpu', str(ngpu), - '--expdir', str(model_path), + '--expdir', str(model_path.absolute()), '--asr_exp', str(model_path), '--use_lm', 'true', '--local_data_opts', str(libri_dir), @@ -299,6 +301,9 @@ def asr_eval_sh(eval_datasets, eval_data_dir, params, model_path, libri_dir, ano eval_data_dir = params['eval_data_dir'] anon_suffix = params['anon_data_suffix'] + # make sure given paths exist + assert eval_data_dir.exists(), f'{eval_data_dir} does not exist' + if 'privacy' in eval_steps: if 'asv' in eval_steps['privacy']: asv_params = params['privacy']['asv'] From 8a6dd3db29f8159f22dd5d82c6a6d67faf020652 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:30:18 +0100 Subject: [PATCH 17/33] Use espnet python package for ASR eval --- evaluation/utility/asr/path.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evaluation/utility/asr/path.sh b/evaluation/utility/asr/path.sh index 01d36dd..77c850a 100755 --- a/evaluation/utility/asr/path.sh +++ b/evaluation/utility/asr/path.sh @@ -1,4 +1,4 @@ -MAIN_ROOT=~/espnet/ # TODO: change this to the path to your ESPnet installation +MAIN_ROOT=./voicepat_env/lib/python3.11/site-packages/espnet # TODO: change this to the path to your ESPnet installation export PATH=$PWD/utils/:$PATH export LC_ALL=C @@ -8,7 +8,12 @@ if [ -f "${MAIN_ROOT}"/tools/activate_python.sh ]; then else echo "[INFO] "${MAIN_ROOT}"/tools/activate_python.sh is not present" fi + +if [ -f "${MAIN_ROOT}"/tools/extra_path.sh ]; then . 
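The common thread in the `run_evaluation.py` changes of patch 16 above is resolving paths to absolute before the script `os.chdir`s into the espnet recipe directory, because relative paths silently re-anchor once the working directory changes. A minimal demonstration, assuming the repository root as the starting working directory:

    # Why patch 16 pins paths down with .absolute(): a relative Path re-anchors
    # after os.chdir, an absolute one does not.
    import os
    from pathlib import Path

    exp_dir = Path("exp/asr_pre")       # relative to the project root
    pinned = exp_dir.absolute()         # resolve while still in the root

    os.chdir("evaluation/utility/asr")  # enter the espnet recipe directory
    print(exp_dir.resolve())            # now resolves inside the recipe dir
    print(pinned)                       # still points at the original location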
"${MAIN_ROOT}"/tools/extra_path.sh +else + echo "[INFO] "${MAIN_ROOT}"/tools/extra_path.sh is not present" +fi export OMP_NUM_THREADS=1 From dd159e1368790d78b7146319310f8914f29f4b76 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:34:57 +0100 Subject: [PATCH 18/33] Use tqdm to display synthesis progress --- anonymization/modules/tts/speech_synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anonymization/modules/tts/speech_synthesis.py b/anonymization/modules/tts/speech_synthesis.py index 4ca3ebc..911e68c 100644 --- a/anonymization/modules/tts/speech_synthesis.py +++ b/anonymization/modules/tts/speech_synthesis.py @@ -119,7 +119,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non with Pool(processes=num_processes) as pool: job_params = zip(instances, self.tts_models, repeat(dataset_results_dir), sleeps, repeat(text_is_phones), repeat(self.save_output)) - new_wavs = pool.starmap(synthesis_job, job_params) + new_wavs = pool.starmap(tqdm(synthesis_job), job_params) for new_wav_dict in new_wavs: wavs.update(new_wav_dict) From 035ad5a2b855a7271d2454592dd72261ceeb8b9f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 18:48:09 +0100 Subject: [PATCH 19/33] Improved documentation for `stts_pipeline.py` --- anonymization/pipelines/sttts_pipeline.py | 182 +++++++++++++++------- 1 file changed, 123 insertions(+), 59 deletions(-) diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py index 53c542a..10f5a1b 100644 --- a/anonymization/pipelines/sttts_pipeline.py +++ b/anonymization/pipelines/sttts_pipeline.py @@ -1,105 +1,169 @@ from pathlib import Path from datetime import datetime -from anonymization.modules import SpeechRecognition, SpeechSynthesis, ProsodyExtraction, ProsodyAnonymization, SpeakerExtraction, \ - SpeakerAnonymization +from anonymization.modules import ( + SpeechRecognition, + SpeechSynthesis, + ProsodyExtraction, + ProsodyAnonymization, + SpeakerExtraction, + SpeakerAnonymization, +) +import typing from utils import prepare_evaluation_data, save_yaml class STTTSPipeline: - """ - This pipeline consists of: - - ASR -> phone sequence - - input - (prosody extractor -> prosody anonymizer) - TTS -> output - - speaker embedding extractor -> speaker anonymizer - - """ - - def __init__(self, config, force_compute_all, devices): + def __init__(self, config: dict, force_compute: bool, devices: list): + """ + Instantiates a STTTSPipeline with the complete feature extraction, + modification and resynthesis. + + This pipeline consists of: + - ASR -> phone sequence - + input - (prosody extr. -> prosody anon.) - TTS -> output + - speaker embedding extr. -> speaker anon. - + + Args: + config (dict): a configuration dictionary, e.g., see anon_ims_sttts_pc.yaml + force_compute (bool): if True, forces re-computation of + all steps. otherwise uses saved results. 
+ devices (list): a list of torch-interpretable devices + """ self.config = config - model_dir = Path(config.get('models_dir', 'models')) - vectors_dir = Path(config.get('vectors_dir', 'original_speaker_embeddings')) - self.results_dir = Path(config.get('results_dir', 'results')) - self.data_dir = Path(config['data_dir']) if 'data_dir' in config else None - save_intermediate = config.get('save_intermediate', True) + model_dir = Path(config.get("models_dir", "models")) + vectors_dir = Path(config.get("vectors_dir", "original_speaker_embeddings")) + self.results_dir = Path(config.get("results_dir", "results")) + self.data_dir = Path(config["data_dir"]) if "data_dir" in config else None + save_intermediate = config.get("save_intermediate", True) - - modules_config = config['modules'] + modules_config = config["modules"] # ASR component - self.speech_recognition = SpeechRecognition(devices=devices, save_intermediate=save_intermediate, - settings=modules_config['asr'], force_compute=force_compute_all) + self.speech_recognition = SpeechRecognition( + devices=devices, + save_intermediate=save_intermediate, + settings=modules_config["asr"], + force_compute=force_compute, + ) # Speaker component - self.speaker_extraction = SpeakerExtraction(devices=devices, - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute, - ) - self.speaker_anonymization = SpeakerAnonymization(vectors_dir=vectors_dir, device=devices[0], - save_intermediate=save_intermediate, - settings=modules_config['speaker_embeddings'], - force_compute=force_compute) + self.speaker_extraction = SpeakerExtraction( + devices=devices, + save_intermediate=save_intermediate, + settings=modules_config["speaker_embeddings"], + force_compute=force_compute, + ) + self.speaker_anonymization = SpeakerAnonymization( + vectors_dir=vectors_dir, + device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config["speaker_embeddings"], + force_compute=force_compute, + ) # Prosody component - if 'prosody' in modules_config: - self.prosody_extraction = ProsodyExtraction(device=devices[0], save_intermediate=save_intermediate, - settings=modules_config['prosody'], - force_compute=force_compute) - if 'anonymizer' in modules_config['prosody']: - self.prosody_anonymization = ProsodyAnonymization(save_intermediate=save_intermediate, - settings=modules_config['prosody'], - force_compute=force_compute) + if "prosody" in modules_config: + self.prosody_extraction = ProsodyExtraction( + device=devices[0], + save_intermediate=save_intermediate, + settings=modules_config["prosody"], + force_compute=force_compute, + ) + if "anonymizer" in modules_config["prosody"]: + self.prosody_anonymization = ProsodyAnonymization( + save_intermediate=save_intermediate, + settings=modules_config["prosody"], + force_compute=force_compute, + ) else: self.prosody_anonymization = None else: self.prosody_extraction = None # TTS component - self.speech_synthesis = SpeechSynthesis(devices=[devices[0]], settings=modules_config['tts'], - model_dir=model_dir, save_output=config.get('save_output', True), - force_compute=force_compute, - ) - - def run_anonymization_pipeline(self, datasets, prepare_results=True): + self.speech_synthesis = SpeechSynthesis( + devices=devices, + settings=modules_config["tts"], + model_dir=model_dir, + save_output=config.get("save_output", True), + force_compute=force_compute, + ) + + def run_anonymization_pipeline( + self, + datasets: typing.Dict[str, Path], + prepare_results: bool = 
True,
+    ):
+        """
+        Runs the anonymization algorithm on the given datasets. Optionally
+        prepares the results such that the evaluation pipeline
+        can interpret them.
+
+        Args:
+            datasets (dict of str -> Path): the datasets on which the
+                anonymization pipeline should be run. These datasets
+                will be processed sequentially.
+            prepare_results (bool): if True, the resulting anonymized
+                .wav files are prepared for evaluation
+        """
         anon_wav_scps = {}

         for i, (dataset_name, dataset_path) in enumerate(datasets.items()):
-            print(f'{i + 1}/{len(datasets)}: Processing {dataset_name}...')
+            print(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...")
             # Step 1: Recognize speech, extract speaker embeddings, extract prosody
-            texts = self.speech_recognition.recognize_speech(dataset_path=dataset_path, dataset_name=dataset_name)
-            spk_embeddings = self.speaker_extraction.extract_speakers(dataset_path=dataset_path,
-                                                                      dataset_name=dataset_name)
+            texts = self.speech_recognition.recognize_speech(
+                dataset_path=dataset_path, dataset_name=dataset_name
+            )
+            spk_embeddings = self.speaker_extraction.extract_speakers(
+                dataset_path=dataset_path, dataset_name=dataset_name
+            )

             if self.prosody_extraction:
-                prosody = self.prosody_extraction.extract_prosody(dataset_path=dataset_path, dataset_name=dataset_name,
-                                                                  texts=texts)
+                prosody = self.prosody_extraction.extract_prosody(
+                    dataset_path=dataset_path, dataset_name=dataset_name, texts=texts
+                )
             else:
                 prosody = None

             # Step 2: Anonymize speaker, change prosody
-            anon_embeddings = self.speaker_anonymization.anonymize_embeddings(speaker_embeddings=spk_embeddings,dataset_name=dataset_name)
+            anon_embeddings = self.speaker_anonymization.anonymize_embeddings(
+                speaker_embeddings=spk_embeddings, dataset_name=dataset_name
+            )

             if self.prosody_anonymization:
-                anon_prosody = self.prosody_anonymization.anonymize_prosody(prosody=prosody)
+                anon_prosody = self.prosody_anonymization.anonymize_prosody(
+                    prosody=prosody
+                )
             else:
                 anon_prosody = prosody

             # Step 3: Synthesize
-            wav_scp = self.speech_synthesis.synthesize_speech(dataset_name=dataset_name, texts=texts,
-                                                              speaker_embeddings=anon_embeddings,
-                                                              prosody=anon_prosody, emb_level=anon_embeddings.emb_level)
+            wav_scp = self.speech_synthesis.synthesize_speech(
+                dataset_name=dataset_name,
+                texts=texts,
+                speaker_embeddings=anon_embeddings,
+                prosody=anon_prosody,
+                emb_level=anon_embeddings.emb_level,
+            )
             anon_wav_scps[dataset_name] = wav_scp
-            print('Done')
+            print("Done")

         if prepare_results:
             if self.speaker_anonymization:
                 anon_vectors_path = self.speaker_anonymization.results_dir
             else:
                 anon_vectors_path = self.speaker_extraction.results_dir
-            now = datetime.strftime(datetime.today(), '%d-%m-%y_%H:%M')
-            prepare_evaluation_data(dataset_dict=datasets, anon_wav_scps=anon_wav_scps,
-                                    anon_vectors_path=anon_vectors_path, anon_suffix=self.speaker_anonymization.suffix,
-                                    output_path=self.results_dir / 'formatted_data' / now)
-            save_yaml(self.config, self.results_dir / 'formatted_data' / now / 'config.yaml')
+            now = datetime.strftime(datetime.today(), "%d-%m-%y_%H:%M")
+            prepare_evaluation_data(
+                dataset_dict=datasets,
+                anon_wav_scps=anon_wav_scps,
+                anon_vectors_path=anon_vectors_path,
+                anon_suffix=self.speaker_anonymization.suffix,
+                output_path=self.results_dir / "formatted_data" / now,
+            )
+            save_yaml(
+                self.config, self.results_dir / "formatted_data" / now / "config.yaml"
+            )

         return anon_wav_scps
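For orientation, the documented interface reduces to a small driver. The following is a minimal usage sketch, not a shipped script: it assumes the config is parsed with HyperPyYAML (as the !ref/!include tags in the configs suggest), and the dataset path and override values are placeholders.

    from pathlib import Path
    import torch
    from hyperpyyaml import load_hyperpyyaml
    from anonymization.pipelines.sttts_pipeline import STTTSPipeline

    with open('configs/anon_ims_sttts_pc.yaml') as f:
        # !PLACEHOLDER entries must be overridden; the values here are illustrative
        config = load_hyperpyyaml(f, overrides={'data_dir': 'data', 'models_dir': 'models'})

    devices = [torch.device('cuda:0')] if torch.cuda.is_available() else [torch.device('cpu')]
    pipeline = STTTSPipeline(config=config, force_compute=False, devices=devices)

    # dataset name -> Kaldi-format directory (wav.scp, utt2spk, spk2gender, ...)
    datasets = {'libri_dev': Path('data/libri_dev')}
    anon_wav_scps = pipeline.run_anonymization_pipeline(datasets, prepare_results=True)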
From f8a36fa3c88367be5f933f9cfd03022401868e72 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:03:22 +0100
Subject: [PATCH 20/33] Add conda recipe for SCTK, required by the pip-installed ESPnet

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index fa31058..2717fe6 100644
--- a/Makefile
+++ b/Makefile
@@ -17,10 +17,10 @@ endif
 ##@ INSTALLATION
 ###############################

-install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment
-
-espnet: ## installs ESPNet
-	echo Deactivated
+install: $(ENV_NAME) ## performs the installation: the conda environment plus a locally built SCTK conda package
+	@git clone https://github.com/egaznep/sctk
+	@conda build sctk
+	@conda install --use-local sctk

 uninstall:
 	@rm -rf $(ENV_NAME)

From 76928c57dfbae926a5ad5a91c5110c34d89c8c71 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:03:37 +0100
Subject: [PATCH 21/33] Fix missing entry in `pool.yaml`

---
 configs/anon/pool.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configs/anon/pool.yaml b/configs/anon/pool.yaml
index 5bad248..b5857d5 100644
--- a/configs/anon/pool.yaml
+++ b/configs/anon/pool.yaml
@@ -1,6 +1,7 @@
 !new:anonymization.modules.speaker_embeddings.anonymization.pool_anon.PoolAnonymizer
     data_dir: !PLACEHOLDER # to be overridden by the main config
     embed_model_path: !PLACEHOLDER # to be overridden by the main config
+    vectors_dir: !PLACEHOLDER # to be overridden by the main config
    pool_data_dir: !ref <data_dir>/libritts_train_other_500
    pool_vec_path: !ref <vectors_dir>/style-embed_spk-level/pool_embeddings
    N: 200

From a7439c598ecad996d4b843855425d83d5a4056ad Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 20:25:41 +0100
Subject: [PATCH 22/33] Simplify model creation for `SpeakerExtraction` and `SpeechRecognition`

---
 .../modules/speaker_embeddings/speaker_extraction.py | 5 +----
 anonymization/modules/text/speech_recognition.py     | 7 ++-----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 5ce6cfb..38faa49 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -44,10 +44,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode
             'model_path': self.embed_model_path,
         }

-        if self.n_processes > 1:
-            self.extractors = None
-        else:
-            self.extractors = create_extractors(hparams=self.model_hparams, device=self.devices[0])
+        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]

     def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         dataset_name = dataset_name if dataset_name is not None else dataset_path.name
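The comprehension above pre-builds one extractor per worker slot by pairing an endlessly repeating device sequence with a bounded range. A standalone sketch of the idiom, with dummy stand-ins for the real devices:

    from itertools import cycle

    devices = ['cuda:0', 'cuda:1']
    n_processes = 2  # worker slots per device

    # cycle() repeats the device list forever; zip() stops after
    # n_processes * len(devices) pairs, spreading slots round-robin.
    slots = [device for device, _ in zip(cycle(devices), range(n_processes * len(devices)))]
    print(slots)  # ['cuda:0', 'cuda:1', 'cuda:0', 'cuda:1']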
diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index bf4dbf6..1a14f45 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -33,11 +33,8 @@ def __init__(self, devices, settings, results_dir=None, save_intermediate=True,
             if self.save_intermediate:
                 raise ValueError('Results dir must be specified in parameters or settings!')

-        self.asr_model = create_model_instance(hparams=self.model_hparams, device=devices[0])
-        self.is_phones = (self.asr_model.output == 'phones')
-
-        if self.n_processes > 1:
-            self.asr_model = None
+        self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]
+        self.is_phones = (self.asr_models[0].output == 'phones')

     def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None):
         dataset_name = dataset_name if dataset_name else dataset_path.name

From 98362ae1a8e9410a30617cee7a394b8d50937d40 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:15:18 +0100
Subject: [PATCH 23/33] Fix minor bug 'force_compute_all' not a valid argument

---
 run_anonymization.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/run_anonymization.py b/run_anonymization.py
index 6c3d0d3..14085d3 100644
--- a/run_anonymization.py
+++ b/run_anonymization.py
@@ -28,6 +28,6 @@
     else:
         devices.append(torch.device('cpu'))

-    pipeline = PIPELINES[config['pipeline']](config=config, force_compute_all=args.force_compute, devices=devices)
-    pipeline.run_anonymization_pipeline(datasets)
-
+    with torch.no_grad():
+        pipeline = PIPELINES[config['pipeline']](config=config, force_compute=args.force_compute, devices=devices)
+        pipeline.run_anonymization_pipeline(datasets)

From 27493faedfc3ecd895adec1a29cfc41c4e1fa913 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:18:27 +0100
Subject: [PATCH 24/33] Fix minor bug 'cycle' undefined

---
 anonymization/modules/text/speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index 1a14f45..0b1887e 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -2,7 +2,7 @@
 from tqdm.contrib.concurrent import process_map
 import time
 from torch.multiprocessing import set_start_method
-from itertools import repeat
+from itertools import cycle, repeat
 import numpy as np
 from pathlib import Path

From f0eec769d6cc4ef87254688a22abe0e6f4f77568 Mon Sep 17 00:00:00 2001
From: Unal Ege Gaznepoglu
Date: Tue, 19 Dec 2023 22:20:42 +0100
Subject: [PATCH 25/33] Fix spurious 'n_processes'

---
 .../modules/speaker_embeddings/speaker_extraction.py | 4 ++--
 anonymization/modules/text/speech_recognition.py     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 38faa49..8af9f7a 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -5,7 +5,7 @@
 from tqdm.contrib.concurrent import process_map
 import time
 from torch.multiprocessing import set_start_method
-from itertools import repeat
+from itertools import repeat, cycle
 import numpy as np

 from .extraction.embedding_methods import SpeechBrainVectors, StyleEmbeddings
@@ -44,7 +44,7 @@ def __init__(self, devices: list, settings: dict, results_dir: Path = None, mode
             'model_path': self.embed_model_path,
         }

-        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))]
+        self.extractors = [create_extractors(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(len(devices)))]

     def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         dataset_name = dataset_name if dataset_name is not None else dataset_path.name
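With range(len(devices)), the zip now yields each device exactly once, so one model is built per device and the roundabout pairing is equivalent to a plain loop. A sketch of the simplification, reusing create_extractors and the hparams from the diff above:

    # zip(cycle(devices), range(len(devices))) pairs each device with an
    # index exactly once, so the comprehension reduces to:
    extractors = [create_extractors(hparams=model_hparams, device=device) for device in devices]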
diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py index 0b1887e..d186a9d 100644 --- a/anonymization/modules/text/speech_recognition.py +++ b/anonymization/modules/text/speech_recognition.py @@ -33,7 +33,7 @@ def __init__(self, devices, settings, results_dir=None, save_intermediate=True, if self.save_intermediate: raise ValueError('Results dir must be specified in parameters or settings!') - self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(n_processes*len(devices)))] + self.asr_models = [create_model_instance(hparams=self.model_hparams, device=device) for device, process in zip(cycle(devices), range(len(devices)))] self.is_phones = (self.asr_models[0].output == 'phones') def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None): From 28e6c580a700bacd2537455e821491dc7ac6505c Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Tue, 19 Dec 2023 22:25:30 +0100 Subject: [PATCH 26/33] Fix speech_recognition --- anonymization/modules/text/speech_recognition.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py index d186a9d..3635426 100644 --- a/anonymization/modules/text/speech_recognition.py +++ b/anonymization/modules/text/speech_recognition.py @@ -71,7 +71,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) start = time.time() if self.n_processes == 1: - new_texts = [recognition_job([utterances, self.asr_model, + new_texts = [recognition_job([utterances, self.asr_models[0], dataset_results_dir, 0, self.devices[0], self.model_hparams, None, save_intermediate])] else: @@ -79,7 +79,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) indices = np.array_split(np.arange(len(utterances)), self.n_processes) utterance_jobs = [[utterances[ind] for ind in chunk] for chunk in indices] # multiprocessing - job_params = zip(utterance_jobs, repeat(self.asr_model), repeat(dataset_results_dir), sleeps, + job_params = zip(utterance_jobs, repeat(self.asr_models), repeat(dataset_results_dir), sleeps, self.devices, repeat(self.model_hparams), list(range(self.n_processes)), repeat(save_intermediate)) new_texts = process_map(recognition_job, job_params, max_workers=self.n_processes) @@ -127,9 +127,6 @@ def recognition_job(data): add_suffix = f'_{job_id}' if job_id is not None else None job_id = job_id or 0 - if asr_model is None: - asr_model = create_model_instance(hparams=model_hparams, device=device) - texts = Text(is_phones=(asr_model.output == 'phones')) i = 0 for utt, spk, wav_path in tqdm(utterances, desc=f'Job {job_id}', leave=True): From 33c3dbf96ec731775ac36aa8ecafd664eeb4045f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Wed, 20 Dec 2023 09:56:43 +0100 Subject: [PATCH 27/33] Fix speaker extraction --- anonymization/modules/speaker_embeddings/speaker_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py index 8af9f7a..9600f1f 100644 --- a/anonymization/modules/speaker_embeddings/speaker_extraction.py +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -78,7 +78,7 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None): utts = [x[1] for x in 
returns] utts = list(np.concatenate(utts)) else: - vectors, utts = extraction_job([wav_scp, self.extractors, 0, self.devices[0], self.model_hparams, 0]) + vectors, utts = extraction_job([wav_scp, self.extractors[0], 0, self.devices[0], self.model_hparams, 0]) vectors = torch.stack(vectors, dim=0) speakers = [utt2spk[utt] for utt in utts] From 08c9f4a90864234be405d292b54eb675372ce023 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Wed, 20 Dec 2023 12:32:48 +0100 Subject: [PATCH 28/33] Changes to the environment.yaml and makefile --- Makefile | 8 ++------ environment.yaml | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2717fe6..2b43415 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ ############################### ## CONFIGURATION ############################### -PHONY: install uninstall pretrained_models espnet +PHONY: install uninstall pretrained_models .ONESHELL: PROJECT_NAME = voicepat @@ -17,14 +17,10 @@ endif ##@ INSTALLATION ############################### -install: $(ENV_NAME) espnet ## performs the installation. Currently the only step is to install the conda environment - @git clone https://github.com/egaznep/sctk - @conda build sctk - @conda install --use-local sctk +install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment uninstall: @rm -rf $(ENV_NAME) - @rm -rf ESPNet @rm -rf models/ pretrained_models: ## downloads the pretrained models from IMS repositories diff --git a/environment.yaml b/environment.yaml index 7d9a926..8818f7c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -7,7 +7,7 @@ channels: dependencies: - pip - pip: - - speechbrain + - git+https://github.com/speechbrain/HyperPyYAML.git # pip version has a bug - noisereduce - pyloudnorm - phonemizer From d0f6f8436c1ddaf685a15b07e571cd5df645086f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 21 Dec 2023 12:50:19 +0100 Subject: [PATCH 29/33] Fix pretrained model installation script --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2b43415..4eb9869 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,10 @@ pretrained_models: ## downloads the pretrained models from IMS repositories @unzip -oq models/asr.zip -d models @unzip -oq models/tts.zip -d models @unzip -oq models/anonymization.zip -d models - @unzip -oq models/pre_eval_models.zip -d models @mkdir evaluation/utility/asr/exp + @unzip -oq models/pre_eval_models.zip -d evaluation/utility/asr/exp @ln -srf evaluation/utility/asr/exp exp + @cp evaluation/privacy/asv/ @rm models/*.zip From 9df17c7a98727e14d59896fa0ea0a66816265915 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Thu, 21 Dec 2023 15:10:24 +0100 Subject: [PATCH 30/33] Fixes to the config files - Add missing !PLACEHOLDER tags - Define two dataset configs, one for anonymization, one for evaluation (vctk all/common/diff requires separate treatment) --- configs/anon_ims_sttts_pc.yaml | 3 +- ...ml => datasets_vpc2022_official_anon.yaml} | 0 configs/datasets_vpc2022_official_eval.yaml | 30 +++++++++++++++ configs/eval_gvd_both.yaml | 32 +--------------- configs/eval_post_ecapa_cos_ft.yaml | 34 +---------------- configs/eval_post_ecapa_cos_scratch.yaml | 32 +--------------- configs/eval_post_xvector_plda_scratch.yaml | 32 +--------------- configs/eval_pre_ecapa_cos.yaml | 10 ++--- configs/eval_pre_ecapa_plda.yaml | 38 ++----------------- configs/eval_pre_xvector_cos.yaml | 32 +--------------- 
configs/eval_pre_xvector_plda.yaml | 32 +--------------- 11 files changed, 48 insertions(+), 227 deletions(-) rename configs/{datasets_vpc2022_official.yaml => datasets_vpc2022_official_anon.yaml} (100%) create mode 100644 configs/datasets_vpc2022_official_eval.yaml diff --git a/configs/anon_ims_sttts_pc.yaml b/configs/anon_ims_sttts_pc.yaml index 87760af..f60f8ff 100644 --- a/configs/anon_ims_sttts_pc.yaml +++ b/configs/anon_ims_sttts_pc.yaml @@ -8,7 +8,8 @@ models_dir: !PLACEHOLDER # TODO adjust path, e.g. /models vectors_dir: !ref /results/original_speaker_embeddings force_compute_all: false -datasets: !include:datasets_vpc2022_official.yaml +save_intermediate: true +datasets: !include:datasets_vpc2022_official_anon.yaml pipeline: sttts diff --git a/configs/datasets_vpc2022_official.yaml b/configs/datasets_vpc2022_official_anon.yaml similarity index 100% rename from configs/datasets_vpc2022_official.yaml rename to configs/datasets_vpc2022_official_anon.yaml diff --git a/configs/datasets_vpc2022_official_eval.yaml b/configs/datasets_vpc2022_official_eval.yaml new file mode 100644 index 0000000..cd69244 --- /dev/null +++ b/configs/datasets_vpc2022_official_eval.yaml @@ -0,0 +1,30 @@ + - name: libri_dev + data: libri + set: dev + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: libri_test + data: libri + set: test + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_dev + data: vctk + set: dev + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_dev_common + data: vctk + set: dev + enrolls: [enrolls] + trials: [trials_f_common, trials_m_common] + - name: vctk_test + data: vctk + set: test + enrolls: [enrolls] + trials: [trials_f, trials_m] + - name: vctk_test_common + data: vctk + set: test + enrolls: [enrolls] + trials: [trials_f_common, trials_m_common] \ No newline at end of file diff --git a/configs/eval_gvd_both.yaml b/configs/eval_gvd_both.yaml index c0b8b91..a8e1f00 100644 --- a/configs/eval_gvd_both.yaml +++ b/configs/eval_gvd_both.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip utility: diff --git a/configs/eval_post_ecapa_cos_ft.yaml b/configs/eval_post_ecapa_cos_ft.yaml index 610d80a..d6dc406 100644 --- a/configs/eval_post_ecapa_cos_ft.yaml +++ b/configs/eval_post_ecapa_cos_ft.yaml @@ -1,37 +1,7 @@ root_dir: . 
exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: @@ -41,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: TODO # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: diff --git a/configs/eval_post_ecapa_cos_scratch.yaml b/configs/eval_post_ecapa_cos_scratch.yaml index 05aa510..a3a60a7 100644 --- a/configs/eval_post_ecapa_cos_scratch.yaml +++ b/configs/eval_post_ecapa_cos_scratch.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: diff --git a/configs/eval_post_xvector_plda_scratch.yaml b/configs/eval_post_xvector_plda_scratch.yaml index 9326102..61b89e6 100644 --- a/configs/eval_post_xvector_plda_scratch.yaml +++ b/configs/eval_post_xvector_plda_scratch.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
Remove entry to skip privacy: diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index f81a568..672224e 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -1,17 +1,17 @@ root_dir: . exp_dir: !ref /exp -datasets: !include:datasets_vpc2022_official.yaml +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: - asv utility: - asr - - gvd + # - gvd -anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +anon_data_suffix: res # suffix for dataset to signal that it is anonymized +eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. privacy: asv: @@ -42,7 +42,7 @@ privacy: utility: asr: - libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: ~/Projects/2022-voiceprivacychallenge/baseline/corpora/ # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_ecapa_plda.yaml b/configs/eval_pre_ecapa_plda.yaml index f27d44a..90363dd 100644 --- a/configs/eval_pre_ecapa_plda.yaml +++ b/configs/eval_pre_ecapa_plda.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: @@ -41,7 +11,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - gvd anon_data_suffix: anon # suffix for dataset to signal that it is anonymized -eval_data_dir: !PLACEHOLDER # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. +eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
privacy: asv: @@ -51,7 +21,7 @@ privacy: training: anon: false # true or false, depending on whether the training data for the ASV is anonymized or original train_data_dir: !ref /LibriSpeech/train-clean-360 # path to original or anonymized training data for ASV - train_config: evaluation/privacy/asv_train/hparams/train_ecapa_tdnn_small.yaml + train_config: evaluation/privacy/asv/asv_train/hparams/train_ecapa_tdnn_small.yaml finetuning: false # true (ft) or false (scratch) pretrained_model: null # path to pretrained model, only used for finetuning lr: 0.01 @@ -72,7 +42,7 @@ privacy: utility: asr: - libri_dir: !PLACEHOLDER # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. + libri_dir: ~/Projects/2022-voiceprivacychallenge/baseline/corpora/ # path to parent dir of original LibriSpeech for data preparation, needs the structure /LibriSpeech/LICENSE.TXT etc. model_name: asr_pre # name for ASR model model_dir: !ref / # path to existing ASR model or output for trained ASR model diff --git a/configs/eval_pre_xvector_cos.yaml b/configs/eval_pre_xvector_cos.yaml index 1c9fc4f..a835ff6 100644 --- a/configs/eval_pre_xvector_cos.yaml +++ b/configs/eval_pre_xvector_cos.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. Remove entry to skip privacy: diff --git a/configs/eval_pre_xvector_plda.yaml b/configs/eval_pre_xvector_plda.yaml index c7e3d04..8ec1b9f 100644 --- a/configs/eval_pre_xvector_plda.yaml +++ b/configs/eval_pre_xvector_plda.yaml @@ -1,37 +1,7 @@ root_dir: . exp_dir: !ref /exp -datasets: - - name: libri_dev - data: libri - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: libri_test - data: libri - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_dev_common - data: vctk - set: dev - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] - - name: vctk_test - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f, trials_m] - - name: vctk_test_common - data: vctk - set: test - enrolls: [enrolls] - trials: [trials_f_common, trials_m_common] +datasets: !include:datasets_vpc2022_official_eval.yaml eval_steps: # all metrics in this list will be computed in the evaluation. 
Remove entry to skip privacy: From 8b098729bbc2e4aa5a766d8f949bf1b12e422c2c Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 10:13:10 +0100 Subject: [PATCH 31/33] Switch to `logging` for print statements --- .../extraction/ims_prosody_extraction.py | 4 +- .../modules/prosody/prosody_extraction.py | 10 +++-- .../anonymization/gan_anon.py | 8 ++-- .../anonymization/pool_anon.py | 7 ++- .../anonymization/random_anon.py | 4 +- .../anonymization/utils/plda_model.py | 6 ++- .../speaker_anonymization.py | 8 ++-- .../speaker_embeddings/speaker_extraction.py | 9 ++-- .../modules/text/speech_recognition.py | 11 ++--- .../InferenceInterfaces/AnonFastSpeech2.py | 4 +- .../modules/tts/IMSToucan/Utility/utils.py | 7 +-- .../modules/tts/IMSToucan/UtteranceCloner.py | 4 +- anonymization/modules/tts/ims_tts.py | 4 +- anonymization/modules/tts/speech_synthesis.py | 11 ++--- anonymization/pipelines/sttts_pipeline.py | 7 ++- evaluation/privacy/asv/asv.py | 6 ++- .../privacy/asv/asv_train/libri_prepare.py | 20 ++++----- evaluation/privacy/asv/metrics/cllr.py | 8 ++-- evaluation/privacy/asv/metrics/linkability.py | 5 ++- .../privacy/asv/metrics/utils/zebra_plots.py | 11 ++--- .../asr/pyscripts/utils/plot_sinc_filters.py | 20 +++++---- .../utility/voice_distinctiveness/deid_gvd.py | 4 +- run_anonymization.py | 3 ++ run_evaluation.py | 43 ++++++++++--------- utils/data_io.py | 4 +- 25 files changed, 136 insertions(+), 92 deletions(-) diff --git a/anonymization/modules/prosody/extraction/ims_prosody_extraction.py b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py index cc700af..7aad79e 100644 --- a/anonymization/modules/prosody/extraction/ims_prosody_extraction.py +++ b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py @@ -1,3 +1,4 @@ +import logging import torch torch.set_num_threads(1) @@ -11,6 +12,7 @@ from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth +logger = logging.getLogger(__name__) class ImsProsodyExtractor: @@ -54,7 +56,7 @@ def extract_prosody(self, try: norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave) except ValueError: - print('Something went wrong, the reference wave might be too short.') + logger.error('Something went wrong, the reference wave might be too short.') raise RuntimeError with torch.inference_mode(): diff --git a/anonymization/modules/prosody/prosody_extraction.py b/anonymization/modules/prosody/prosody_extraction.py index 478f574..dcc6c2d 100644 --- a/anonymization/modules/prosody/prosody_extraction.py +++ b/anonymization/modules/prosody/prosody_extraction.py @@ -1,3 +1,4 @@ +import logging import torch torch.set_num_threads(1) @@ -8,6 +9,7 @@ from .extraction import * from utils import read_kaldi_format +logger = logging.getLogger(__name__) class ProsodyExtraction: @@ -47,7 +49,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None): wav_scp = {utt: wav_scp[utt] for utt in unprocessed_utts} if wav_scp: - print(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances') + logger.info(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances') data_prosody.new = True i = 0 for utt, wav_path in tqdm(wav_scp.items()): @@ -56,7 +58,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None): utt_prosody = 
self.extractor.extract_prosody(transcript=text, ref_audio_path=wav_path, input_is_phones=text_is_phones)
             except IndexError:
-                print(f'Index Error for {utt}')
+                logger.warning(f'IndexError for {utt}')
                 continue
             duration, pitch, energy, start_silence, end_silence = utt_prosody
             data_prosody.add_instance(utterance=utt, duration=duration, pitch=pitch, energy=energy,
@@ -69,8 +71,8 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
                 data_prosody.save_prosody(dataset_results_dir)

         elif len(data_prosody.utterances) > 0:
-            print('No prosody extraction necessary; load stored values instead...')
+            logger.info('No prosody extraction necessary; load stored values instead...')
         else:
-            print(f'No utterances could be found in {dataset_path}!')
+            logger.warning(f'No utterances could be found in {dataset_path}!')

         return data_prosody

diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
index 0fb5757..5d1ab84 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 import torch
 import numpy as np
@@ -10,6 +11,7 @@
 from ..speaker_embeddings import SpeakerEmbeddings
 from .utils.WGAN import EmbeddingsGenerator

+logger = logging.getLogger(__name__)

 class GANAnonymizer(BaseAnonymizer):
     """
@@ -84,9 +86,9 @@ def anonymize_embeddings(
                 or 'utt' for utterance level).
         """
         if emb_level == "spk":
-            print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
+            logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
         elif emb_level == "utt":
-            print(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")
+            logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")

         identifiers = []
         speakers = []
@@ -117,7 +119,7 @@ def anonymize_embeddings(
         return anon_embeddings

     def _generate_artificial_embeddings(self, gan_model_path: Path, n: int):
-        print(f"Generate {n} artificial speaker embeddings...")
+        logger.info(f"Generate {n} artificial speaker embeddings...")
         generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device)
         gan_vectors = generator.generate_embeddings(n=n)
         unused_indices = np.arange(len(gan_vectors))

diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
index 3a21db1..c24151f 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 import numpy as np
 import torch
@@ -15,6 +16,8 @@
 from ..speaker_embeddings import SpeakerEmbeddings
 from utils import transform_path

+logger = logging.getLogger(__name__)
+
 REVERSED_GENDERS = {
     "m": "f",
     "f": "m"
 }
@@ -144,7 +147,7 @@ def __init__(
         self.stats_per_dim_path = stats_per_dim_path or Path()

     def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_path):
-        print(pool_data_dir)
+        logger.debug(pool_data_dir)
         if pool_vec_path.exists():
             pool_embeddings = SpeakerEmbeddings(
                 vec_type=self.vec_type, emb_level="spk", device=self.device
             )
@@ -168,7 +171,7 @@ def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str
             vectors_a=self.pool_embeddings.vectors, vectors_b=speaker_embeddings.vectors
         )

-        print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
+        logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
         identifiers = []
         speakers = []
         anon_vectors = []
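All of these files adopt the same per-module logger pattern. As a reminder of how it behaves, a minimal sketch; the basicConfig call is illustrative and not part of this patch, some handler setup along these lines is assumed at the entry points:

    import logging

    logger = logging.getLogger(__name__)  # hierarchical name, e.g. 'anonymization.modules...'

    def work():
        logger.info('started')  # dropped silently until a handler is configured

    if __name__ == '__main__':
        # one-time setup at the application entry point; module loggers
        # inherit the root handler, and level filters apply hierarchically
        logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s: %(message)s')
        work()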
diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
index 98ffc84..f0d2463 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from pathlib import Path
 import torch
 from os import PathLike
@@ -8,6 +9,7 @@
 from .base_anon import BaseAnonymizer
 from ..speaker_embeddings import SpeakerEmbeddings

+logger = logging.getLogger(__name__)

 class RandomAnonymizer(BaseAnonymizer):
     """
@@ -73,7 +75,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level="spk"):
                 utterance level).
         """
         if self.scaling_ranges:
-            print("Anonymize vectors in scale!")
+            logger.debug("Anonymize vectors in scale!")
             return self._anonymize_data_in_scale(speaker_embeddings)
         else:
             identifiers = []

diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
index 3abf1fa..18482de 100644
--- a/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
+++ b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py
@@ -1,9 +1,11 @@
 # This code is based on the descriptions in https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/PLDA_LDA.py
+import logging
 from pathlib import Path
 from speechbrain.processing.PLDA_LDA import PLDA, StatObject_SB, Ndx, fast_PLDA_scoring
 import numpy as np
 import torch

+logger = logging.getLogger(__name__)

 class PLDAModel:
     def __init__(self, train_embeddings, results_path: Path=None, save_plda=True):
@@ -64,13 +66,13 @@ def _train_plda(self, train_embeddings):
         vectors = train_embeddings.vectors.to(torch.float64)
         modelset = np.array([f'md{speaker}' for speaker in train_embeddings.original_speakers], dtype="|O")

-        print(len(modelset), len(set(modelset)))
+        logger.debug('%s %s', len(modelset), len(set(modelset)))
         segset, s, stat0 = self._get_vector_stats(vectors, sg_tag='sg', utt_ids=train_embeddings.get_utt_list())

         xvectors_stat = StatObject_SB(modelset=modelset, segset=segset, start=s, stop=s, stat0=stat0,
                                       stat1=vectors.cpu().numpy())

-        print(vectors.shape)
+        logger.debug(vectors.shape)
         plda = PLDA(rank_f=100)
         plda.plda(xvectors_stat)
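One subtlety when converting print calls: print accepts several positional values, while logging treats extra positional arguments as %-style substitutions for the message string, hence the '%s %s' form above. A short sketch with dummy values:

    import logging

    logger = logging.getLogger(__name__)
    n_models, n_unique = 921, 921  # dummy values

    logger.debug('%s %s', n_models, n_unique)  # lazy: formatted only if the record is emitted
    logger.debug(f'{n_models} {n_unique}')     # eager f-string, also valid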
diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py
index 5c82113..f156eaf 100644
--- a/anonymization/modules/speaker_embeddings/speaker_anonymization.py
+++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py
@@ -1,8 +1,10 @@
+import logging
 from pathlib import Path

 from .anonymization.base_anon import BaseAnonymizer
 from .speaker_embeddings import SpeakerEmbeddings

+logger = logging.getLogger(__name__)

 class SpeakerAnonymization:

@@ -38,14 +40,14 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name):
                 self.force_compute:
             # if there are already anonymized speaker embeddings from this model and the computation is not forced,
             # simply load them
-            print('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
+            logger.info('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
                   'instead...')
             anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=self.emb_level, device=self.device)
             anon_embeddings.load_vectors(dataset_results_dir)
             return anon_embeddings
         else:
             # otherwise, create new anonymized speaker embeddings
-            print('Anonymize speaker embeddings...')
+            logger.info('Anonymize speaker embeddings...')
             anon_embeddings = self.anonymizer.anonymize_embeddings(speaker_embeddings,
                                                                    emb_level=self.emb_level)
             if self.save_intermediate:
@@ -58,5 +60,5 @@ def _load_anonymizer(self, settings: dict):
             'The anonymizer must be an instance of BaseAnonymizer, or a ' \
             f'subclass of it, but received an instance of {type(anon_method)}'

-        print(f'Model type of anonymizer: {type(anon_method).__name__}')
+        logger.info(f'Model type of anonymizer: {type(anon_method).__name__}')
         return anon_method

diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py
index 9600f1f..507781f 100644
--- a/anonymization/modules/speaker_embeddings/speaker_extraction.py
+++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py
@@ -1,3 +1,4 @@
+import logging
 from tqdm import tqdm
 from pathlib import Path
 import torch
@@ -14,7 +15,7 @@
 from utils import read_kaldi_format

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeakerExtraction:

@@ -57,12 +58,12 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
         speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='utt', device=self.devices[0])

         if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute:
-            print('No speaker extraction necessary; load existing embeddings instead...')
+            logger.info('No speaker extraction necessary; load existing embeddings instead...')
             speaker_embeddings.load_vectors(dataset_results_dir)
             # assume the loaded vectors are computed according to the setting in config
             speaker_embeddings.emb_level = emb_level
         else:
-            print(f'Extract embeddings of {len(wav_scp)} utterances')
+            logger.info(f'Extract embeddings of {len(wav_scp)} utterances')
             speaker_embeddings.new = True

             if self.n_processes > 1:
@@ -126,7 +127,7 @@ def extraction_job(data):
         try:
             spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors]
         except RuntimeError as e:
-            print(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
+            logger.warning(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
             continue

         if len(spk_embs) == 1:

diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py
index 3635426..8992b8c 100644
--- a/anonymization/modules/text/speech_recognition.py
+++ b/anonymization/modules/text/speech_recognition.py
@@ -1,6 +1,7 @@
 from tqdm import tqdm
 from tqdm.contrib.concurrent import process_map
 import time
+import logging
 from torch.multiprocessing import set_start_method
 from itertools import cycle, repeat
 import numpy as np
@@ -11,7 +12,7 @@
 from utils import read_kaldi_format

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeechRecognition:

@@ -49,13 +50,13 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None)
             texts.load_text(in_dir=dataset_results_dir)

             if len(texts) == len(utt2spk):
-                print('No speech recognition necessary; load existing text instead...')
+                logger.info('No speech recognition necessary; load existing text instead...')
             else:
                 if len(texts) > 0:
-                    print(f'No speech recognition necessary for {len(texts)}
of {len(utt2spk)} utterances') + logger.info(f'No speech recognition necessary for {len(texts)} of {len(utt2spk)} utterances') # otherwise, recognize the speech dataset_results_dir.mkdir(exist_ok=True, parents=True) - print(f'Recognize speech of {len(utt2spk)} utterances...') + logger.info(f'Recognize speech of {len(utt2spk)} utterances...') wav_scp = read_kaldi_format(dataset_path / 'wav.scp') utterances = [] @@ -86,7 +87,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None) end = time.time() total_time = round(end - start, 2) - print(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / ' + logger.info(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / ' f'{round(total_time / 60 / 60, 2)} hours)') texts = self._combine_texts(main_text_instance=texts, additional_text_instances=new_texts) diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py index 8ecd359..a2ba7c3 100644 --- a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py +++ b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py @@ -1,5 +1,6 @@ import itertools import os +import logging import librosa.display as lbd import matplotlib.pyplot as plt @@ -15,6 +16,7 @@ from ..Preprocessing.TextFrontend import get_language_id from ..TrainingInterfaces.Spectrogram_to_Embedding.StyleEmbedding import StyleEmbedding +logger = logging.getLogger(__name__) class AnonFastSpeech2(torch.nn.Module): @@ -174,7 +176,7 @@ def read_to_file(self, for (text, durations, pitch, energy) in itertools.zip_longest(text_list, dur_list, pitch_list, energy_list): if text.strip() != "": if not silent: - print("Now synthesizing: {}".format(text)) + logger.info("Now synthesizing: {}".format(text)) if wav is None: if durations is not None: durations = durations.to(self.device) diff --git a/anonymization/modules/tts/IMSToucan/Utility/utils.py b/anonymization/modules/tts/IMSToucan/Utility/utils.py index 5fa60eb..9809d58 100644 --- a/anonymization/modules/tts/IMSToucan/Utility/utils.py +++ b/anonymization/modules/tts/IMSToucan/Utility/utils.py @@ -4,9 +4,10 @@ import os from abc import ABC - +import logging import torch +logger = logging.getLogger(__name__) def cumsum_durations(durations): out = [0] @@ -39,11 +40,11 @@ def get_most_recent_checkpoint(checkpoint_dir, verbose=True): if el.endswith(".pt") and el != "best.pt": checkpoint_list.append(int(el.split(".")[0].split("_")[1])) if len(checkpoint_list) == 0: - print("No previous checkpoints found, cannot reload.") + logger.info("No previous checkpoints found, cannot reload.") return None checkpoint_list.sort(reverse=True) if verbose: - print("Reloading checkpoint_{}.pt".format(checkpoint_list[0])) + logger.info("Reloading checkpoint_{}.pt".format(checkpoint_list[0])) return os.path.join(checkpoint_dir, "checkpoint_{}.pt".format(checkpoint_list[0])) diff --git a/anonymization/modules/tts/IMSToucan/UtteranceCloner.py b/anonymization/modules/tts/IMSToucan/UtteranceCloner.py index 6d05e8e..df02d78 100644 --- a/anonymization/modules/tts/IMSToucan/UtteranceCloner.py +++ b/anonymization/modules/tts/IMSToucan/UtteranceCloner.py @@ -1,3 +1,4 @@ +import logging import soundfile as sf import torch from torch.optim import SGD @@ -10,6 +11,7 @@ from .TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator from 
.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth

+logger = logging.getLogger(__name__)

 class UtteranceCloner:

@@ -59,7 +61,7 @@ def extract_prosody(self,
         try:
             norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
         except ValueError:
-            print('Something went wrong, the reference wave might be too short.')
+            logger.error('Something went wrong, the reference wave might be too short.')
             raise RuntimeError

         with torch.inference_mode():

diff --git a/anonymization/modules/tts/ims_tts.py b/anonymization/modules/tts/ims_tts.py
index a5a4cc6..edcabb1 100644
--- a/anonymization/modules/tts/ims_tts.py
+++ b/anonymization/modules/tts/ims_tts.py
@@ -1,8 +1,10 @@
 import torch
 import resampy
+import logging

 from .IMSToucan.InferenceInterfaces.AnonFastSpeech2 import AnonFastSpeech2

+logger = logging.getLogger(__name__)

 class ImsTTS:

@@ -36,7 +38,7 @@ def read_text(self, text, speaker_embedding, text_is_phones=True, duration=None,
             if i > 30:
                 break
         if i > 0:
-            print(f'Synthesized utt in {i} takes')
+            logger.info(f'Synthesized utt in {i} takes')

         # start and end silence are computed for 16000, so we have to adapt this to different output sr
         factor = self.output_sr // 16000

diff --git a/anonymization/modules/tts/speech_synthesis.py b/anonymization/modules/tts/speech_synthesis.py
index 911e68c..17d996a 100644
--- a/anonymization/modules/tts/speech_synthesis.py
+++ b/anonymization/modules/tts/speech_synthesis.py
@@ -1,6 +1,7 @@
 from tqdm import tqdm
 import soundfile
 import time
+import logging
 from torch.multiprocessing import Pool, set_start_method
 from itertools import repeat

@@ -8,7 +9,7 @@
 from utils import create_clean_dir

 set_start_method('spawn', force=True)
-
+logger = logging.getLogger(__name__)

 class SpeechSynthesis:

@@ -52,7 +53,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                                          if wav_file.stem in texts.utterances}

             if len(already_synthesized_utts):
-                print(f'No synthesis necessary for {len(already_synthesized_utts)} of {len(texts)} utterances...')
+                logger.info(f'No synthesis necessary for {len(already_synthesized_utts)} of {len(texts)} utterances...')
                 texts.remove_instances(list(already_synthesized_utts.keys()))
                 if self.save_output:
                     wavs = already_synthesized_utts
@@ -63,7 +64,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                         wavs[utt] = wav

         if texts:
-            print(f'Synthesize {len(texts)} utterances...')
+            logger.info(f'Synthesize {len(texts)} utterances...')
             if self.force_compute or not dataset_results_dir.exists():
                 create_clean_dir(dataset_results_dir)

@@ -84,7 +85,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                         utt_prosody_dict = {}
                     instances.append((text, utt, speaker_embedding, utt_prosody_dict))
                 except KeyError:
-                    print(f'Key error at {utt}')
+                    logger.warning(f'Key error at {utt}')
                     continue

             wavs.update(synthesis_job(instances=instances, tts_model=self.tts_models[0],
                                       out_dir=dataset_results_dir, sleep=0, text_is_phones=text_is_phones,
@@ -111,7 +112,7 @@ def synthesize_speech(self, dataset_name, texts, speaker_embeddings, prosody=Non
                             utt_prosody_dict = {}
                         job_instances.append((text, utt, speaker_embedding, utt_prosody_dict))
                     except KeyError:
-                        print(f'Key error at {utt}')
+                        logger.warning(f'Key error at {utt}')
                         continue
                 instances.append(job_instances)

diff --git a/anonymization/pipelines/sttts_pipeline.py b/anonymization/pipelines/sttts_pipeline.py
index 10f5a1b..9226ace 100644
--- a/anonymization/pipelines/sttts_pipeline.py
+++
b/anonymization/pipelines/sttts_pipeline.py @@ -1,5 +1,6 @@ from pathlib import Path from datetime import datetime +import logging from anonymization.modules import ( SpeechRecognition, @@ -12,6 +13,7 @@ import typing from utils import prepare_evaluation_data, save_yaml +logger = logging.getLogger(__name__) class STTTSPipeline: def __init__(self, config: dict, force_compute: bool, devices: list): @@ -110,7 +112,7 @@ def run_anonymization_pipeline( anon_wav_scps = {} for i, (dataset_name, dataset_path) in enumerate(datasets.items()): - print(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...") + logger.info(f"{i + 1}/{len(datasets)}: Processing {dataset_name}...") # Step 1: Recognize speech, extract speaker embeddings, extract prosody texts = self.speech_recognition.recognize_speech( dataset_path=dataset_path, dataset_name=dataset_name @@ -147,9 +149,10 @@ def run_anonymization_pipeline( emb_level=anon_embeddings.emb_level, ) anon_wav_scps[dataset_name] = wav_scp - print("Done") + logger.info("Anonymization pipeline completed.") if prepare_results: + logger.info("Preparing results according to the Kaldi format.") if self.speaker_anonymization: anon_vectors_path = self.speaker_anonymization.results_dir else: diff --git a/evaluation/privacy/asv/asv.py b/evaluation/privacy/asv/asv.py index 6792d44..66d8080 100644 --- a/evaluation/privacy/asv/asv.py +++ b/evaluation/privacy/asv/asv.py @@ -1,5 +1,6 @@ # This code is partly based on # https://github.com/speechbrain/speechbrain/blob/develop/recipes/VoxCeleb/SpeakerRec/speaker_verification_plda.py +import logging from pathlib import Path import torch from speechbrain.utils.metric_stats import EER @@ -10,6 +11,7 @@ from anonymization.modules.speaker_embeddings import SpeakerExtraction from utils import write_table, read_kaldi_format, save_kaldi_format +logger = logging.getLogger(__name__) class ASV: @@ -138,7 +140,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids): if self.plda_model_dir.exists(): self.plda = PLDAModel(train_embeddings=None, results_path=self.plda_model_dir) else: - print('Train PLDA model...') + logger.info('Train PLDA model...') plda_data_dir = self.plda_train_data_dir if self.plda_anon: @@ -146,7 +148,7 @@ def compute_distances(self, enrol_vectors, enrol_ids, test_vectors, test_ids): self.select_data_for_plda(all_data_dir=self.plda_train_data_dir, selected_data_dir=self.model_dir.parent, out_dir=plda_data_dir) - print(f'Using data under {plda_data_dir}') + logger.info(f'Using data under {plda_data_dir}') train_dict = self.extractor.extract_speakers(dataset_path=plda_data_dir, emb_level='utt') self.plda = PLDAModel(train_embeddings=train_dict, results_path=self.plda_model_dir) diff --git a/evaluation/privacy/asv/asv_train/libri_prepare.py b/evaluation/privacy/asv/asv_train/libri_prepare.py index 65be867..ab1630e 100644 --- a/evaluation/privacy/asv/asv_train/libri_prepare.py +++ b/evaluation/privacy/asv/asv_train/libri_prepare.py @@ -180,7 +180,7 @@ def _get_utt_split_lists( train_lst = [] dev_lst = [] - print("Getting file list...") + logger.debug("Getting file list...") for data_folder in data_folders: if anon: suffix = 'wav' @@ -212,14 +212,14 @@ def _get_utt_split_lists( selected_spk = {} #select the number of speakers if num_spk != 'ALL': - print("selected %s speakers for training"%num_spk) + logger.debug("selected %s speakers for training"%num_spk) selected_spks_pure = random.sample(spks_pure,int(num_spk)) for k,v in spk_files.items(): if k.split('-')[0] in selected_spks_pure: 
selected_spk[k] = v #selected_spk = dict(random.sample(spk_files.items(), int(num_spk))) elif num_spk == 'ALL': - print("selected all speakers for training") + logger.debug("selected all speakers for training") selected_spk = spk_files else: sys.exit("invalid $utt_spk value") @@ -228,7 +228,7 @@ def _get_utt_split_lists( if num_utt != 'ALL': # select the number of utterances for each speaker-sess-id if utt_selected_ways == 'spk-sess': - print("selected %s utterances for each selected speaker-sess-id" % num_utt) + logger.info("selected %s utterances for each selected speaker-sess-id" % num_utt) for spk in selected_spk: if len(selected_spk[spk]) >= int(num_utt): selected_list.extend(random.sample(selected_spk[spk], int(num_utt))) @@ -236,7 +236,7 @@ def _get_utt_split_lists( selected_list.extend(selected_spk[spk]) elif utt_selected_ways == 'spk-random': - print("randomly selected %s utterances for each selected speaker-id" % num_utt) + logger.info("randomly selected %s utterances for each selected speaker-id" % num_utt) selected_spks_pure = {} for k, v in selected_spk.items(): spk_pure = k.split('-')[0] @@ -253,7 +253,7 @@ def _get_utt_split_lists( selected_list.extend(selected_spk[spk]) elif utt_selected_ways == 'spk-diverse-sess': - print("diversely selected %s utterances for each selected speaker-id" % num_utt) + logger.info("diversely selected %s utterances for each selected speaker-id" % num_utt) selected_spks_pure = {} for k, v in selected_spk.items(): spk_pure = k.split('-')[0] @@ -273,7 +273,7 @@ def _get_utt_split_lists( elif num_utt == 'ALL': - print("selected all utterances for each selected speaker") + logger.info("selected all utterances for each selected speaker") for value in selected_spk.values(): for v in value: @@ -297,8 +297,8 @@ def _get_utt_split_lists( full = f'Full training set:{full_utt}' used = f'Used for training:{len(selected_list)}' - print(full) - print(used) + logger.debug(full) + logger.debug(used) split = int(0.01 * split_ratio[0] * len(selected_list)) train_snts = selected_list[:split] @@ -417,7 +417,7 @@ def prepare_csv(seg_dur, wav_lst, csv_file, random_segment=False, amp_th=0): ] entry.append(csv_line) - print(f'Skipped {len(problematic_wavs)} invalid audios') + logger.info(f'Skipped {len(problematic_wavs)} invalid audios') csv_output = csv_output + entry # Writing the csv lines diff --git a/evaluation/privacy/asv/metrics/cllr.py b/evaluation/privacy/asv/metrics/cllr.py index c11ba2d..40ba800 100644 --- a/evaluation/privacy/asv/metrics/cllr.py +++ b/evaluation/privacy/asv/metrics/cllr.py @@ -1,9 +1,11 @@ +import logging import numpy as np from scipy.special import expit from .helpers import optimal_llr from .utils.io import read_targets_and_nontargets +logger = logging.getLogger(__name__) def compute_cllr(score_file, key_file, compute_eer=False): # Computing Cllr and min Cllr for binary decision classifiers @@ -15,11 +17,9 @@ def compute_cllr(score_file, key_file, compute_eer=False): else: cllr_min = min_cllr(tar, non) - print("Cllr (min/act): %.3f/%.3f" % (cllr_min, cllr_act)) + logger.info("Cllr (min/act): %.3f/%.3f" % (cllr_min, cllr_act)) if compute_eer: - print("ROCCH-EER: %2.3f%%" % (100*eer)) - - print("") + logger.info("ROCCH-EER: %2.3f%%" % (100*eer)) def cllr(tar_llrs, nontar_llrs): diff --git a/evaluation/privacy/asv/metrics/linkability.py b/evaluation/privacy/asv/metrics/linkability.py index 0d0e057..91529e2 100644 --- a/evaluation/privacy/asv/metrics/linkability.py +++ b/evaluation/privacy/asv/metrics/linkability.py @@ -1,8 +1,10 @@ +import 
logging import numpy as np from .utils.visualization import draw_linkability_scores from .utils.io import read_targets_and_nontargets +logger = logging.getLogger(__name__) def compute_linkability(score_file, key_file, omega=1.0, use_draw_scores=False, output_file=None): # Computing the global linkability measure for a list of linkage function score @@ -20,8 +22,7 @@ def compute_linkability(score_file, key_file, omega=1.0, use_draw_scores=False, output_file = "linkability_" + score_file draw_linkability_scores(mated_scores, non_mated_scores, Dsys, D, bin_centers, bin_edges, str(output_file)) - print("linkability: %f" % (Dsys)) - print("") + logger.info("linkability: %f" % (Dsys)) def linkability(mated_scores, non_mated_scores, omega=1): diff --git a/evaluation/privacy/asv/metrics/utils/zebra_plots.py b/evaluation/privacy/asv/metrics/utils/zebra_plots.py index 81757d5..a59095a 100644 --- a/evaluation/privacy/asv/metrics/utils/zebra_plots.py +++ b/evaluation/privacy/asv/metrics/utils/zebra_plots.py @@ -1,3 +1,4 @@ +import logging import numpy as np from matplotlib._cm import datad import matplotlib.pyplot as mpl @@ -7,6 +8,7 @@ from .plo_plots import PriorLogOddsPlots from .io import read_targets_and_nontargets +logger = logging.getLogger(__name__) __author__ = "Andreas Nautsch" __email__ = "nautsch@eurecom.fr" @@ -84,10 +86,9 @@ def zebra_framework(plo_plot, scr_path, key_path, label='ZEBRA profile', str_max_abs_llr = '0' # print outs - print('') - print("%s" % label) - print("Population: %s bit" % str_dece) - print("Individual: %s (%s)" % (str_max_abs_llr, cat_tag)) + logger.info("%s" % label) + logger.info("Population: %s bit" % str_dece) + logger.info("Individual: %s (%s)" % (str_max_abs_llr, cat_tag)) # Creating log-odds plots if color_min is not None: @@ -99,7 +100,7 @@ def zebra_framework(plo_plot, scr_path, key_path, label='ZEBRA profile', # DCF if dcf_pot: plo_plot.plot_dcf(color_min=color_min, style_min=style_min, color_act=color_act, style_act=style_act) - print("1 - min Cllr: %.3f (0 is good)" % plo_plot.get_delta_DCF()) + logger.info("1 - min Cllr: %.3f (0 is good)" % plo_plot.get_delta_DCF()) plo_plot.add_legend_entry(legend_entry) diff --git a/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py b/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py index 6ca071f..8fc34b1 100755 --- a/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py +++ b/evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py @@ -12,6 +12,7 @@ """ import argparse +import logging import sys from pathlib import Path @@ -19,6 +20,7 @@ import numpy as np import torch +logger = logging.getLogger(__name__) def get_parser(): """Construct the parser.""" @@ -141,7 +143,7 @@ def plot_filtergraph( ax.fill_between(x, f_mins, f_maxs, color="green", alpha=0.3) ax.legend(loc="upper left", prop={"size": 15}) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): @@ -154,7 +156,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): """ from espnet2.layers.sinc_conv import SincConv - print( + logger.warning( "When plotting filter kernels, make sure the script has the" " correct SincConv settings (currently hard-coded)."
) @@ -202,7 +204,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_pre_kernel_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) kernel = kernels[i][0] plt.clf() @@ -212,7 +214,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_kernel_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) plt.clf() plt.xlabel("kernel index") @@ -221,7 +223,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_kernel_both_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) y = np.zeros_like(x_f) y[F_mins[i] : F_maxs[i]] = 1.0 @@ -230,7 +232,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_freq_%s.%s" % (str(i).zfill(2), args.filetype) img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) pre_y = np.zeros_like(x_f) pre_y[pre_F_mins[i] : pre_F_maxs[i]] = 1.0 @@ -240,7 +242,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_name = "filter_freq_both_%s.%s" % (str(i).zfill(2), args.filetype) img_path = args.out_folder / img_name plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) plt.clf() filters = [32, 71, 113, 126] @@ -259,7 +261,7 @@ def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args): img_path = str(args.out_folder / img_name) plt.savefig(img_path, bbox_inches="tight") plt.close(fig) - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def plot_filters(indices, filename, F_mins, F_maxs, output_folder): @@ -282,7 +284,7 @@ def plot_filters(indices, filename, F_mins, F_maxs, output_folder): plt.plot(x, y) img_path = str(output_folder / filename) plt.savefig(img_path, bbox_inches="tight") - print("Plotted %s" % img_path) + logger.debug("Plotted %s" % img_path) def main(argv): diff --git a/evaluation/utility/voice_distinctiveness/deid_gvd.py b/evaluation/utility/voice_distinctiveness/deid_gvd.py index fb2c9df..6733756 100644 --- a/evaluation/utility/voice_distinctiveness/deid_gvd.py +++ b/evaluation/utility/voice_distinctiveness/deid_gvd.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path import numpy as np import pandas as pd @@ -11,6 +12,7 @@ from evaluation.privacy import ASV from evaluation.privacy.asv.metrics.helpers import optimal_llr +logger = logging.getLogger(__name__) class VoiceDistinctiveness: @@ -98,7 +100,7 @@ def _select_utterances(self, spk2utt_x, spk2utt_y): y = [(spk, utt) for spk, utt_list in spk2utt_y.items() for utt in utt_list] else: - print("choose %d utterances for each spk to create trial" % int(self.num_per_spk)) + logger.info("choose %d utterances for each spk to create trial" % int(self.num_per_spk)) x = [(spk, utt) for spk, utt_list in spk2utt_x.items() for utt in random.sample(utt_list, k=min(self.num_per_spk, len(utt_list)))] y = [(spk, utt) for spk, utt_list in spk2utt_y.items() diff --git a/run_anonymization.py 
b/run_anonymization.py index 14085d3..49cf865 100644 --- a/run_anonymization.py +++ b/run_anonymization.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path from argparse import ArgumentParser import torch @@ -29,5 +30,7 @@ devices.append(torch.device('cpu')) with torch.no_grad(): + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logging.info(f'Running pipeline: {config["pipeline"]}') pipeline = PIPELINES[config['pipeline']](config=config, force_compute=args.force_compute, devices=devices) pipeline.run_anonymization_pipeline(datasets) diff --git a/run_evaluation.py b/run_evaluation.py index d81ab07..13b4976 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -1,4 +1,5 @@ # We need to set CUDA_VISIBLE_DEVICES before we import Pytorch so we will read all arguments directly on startup +import logging import os from argparse import ArgumentParser from pathlib import Path @@ -74,7 +75,7 @@ def find_asv_model_checkpoint(model_dir): def asv_train(train_params, output_dir): - print(f'Train ASV model: {output_dir}') + logging.info(f'Train ASV model: {output_dir}') hparams = { 'pretrained_path': str(train_params['pretrained_model']), 'batch_size': train_params['batch_size'], @@ -103,7 +104,7 @@ def asv_train(train_params, output_dir): def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, model_dir=None): model_dir = model_dir or find_asv_model_checkpoint(params['model_dir']) - print(f'Use ASV model for evaluation: {model_dir}') + logging.info(f'Use ASV model for evaluation: {model_dir}') save_dir = params['evaluation']['results_dir'] / f'{params["evaluation"]["distance"]}_out' asv = ASV(model_dir=model_dir, device=device, score_save_dir=save_dir, distance=params['evaluation']['distance'], @@ -121,7 +122,7 @@ def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, mod EER = asv.eer_compute(enrol_dir=eval_data_dir / enroll_name, test_dir=eval_data_dir / test_name, trial_runs_file=eval_data_dir / trial / 'trials') - print(f'{enroll_name}-{test_name}: {scenario.upper()}-EER={EER}') + logging.info(f'{enroll_name}-{test_name}: {scenario.upper()}-EER={EER}') trials_info = trial.split('_') gender = trials_info[3] if 'common' in trial: @@ -131,7 +132,7 @@ def asv_eval(eval_datasets, eval_data_dir, params, device, anon_data_suffix, mod 'trial': 'original' if scenario[1] == 'o' else 'anon', 'EER': round(EER * 100, 3)}) results_df = pd.DataFrame(results) - print(results_df) + logging.info(results_df) results_df.to_csv(save_dir / 'results.csv') @@ -164,7 +165,7 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): **vd_settings) vd_orig, vd_anon = None, None save_dir_orig, save_dir_anon = None, None - print(f'Use ASV model {spk_ext_model_dir} for computing voice similarities of original and anonymized speakers') + logging.info(f'Use ASV model {spk_ext_model_dir} for computing voice similarities of original and anonymized speakers') elif 'orig_model_dir' in params['asv_params'] and 'anon_model_dir' in params['asv_params']: # use different ASV models for original and anon speaker spaces spk_ext_model_dir_orig = find_asv_model_checkpoint(params['asv_params']['orig_model_dir']) @@ -176,7 +177,7 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): vd_anon = VoiceDistinctiveness(spk_ext_model_dir=spk_ext_model_dir_anon, score_save_dir=save_dir_anon, **vd_settings) vd = None - print(f'Use ASV model {spk_ext_model_dir_orig} for computing voice similarities of
original speakers and ASV ' + logging.info(f'Use ASV model {spk_ext_model_dir_orig} for computing voice similarities of original speakers and ASV ' f'model {spk_ext_model_dir_anon} for voice similarities of anonymized speakers') else: raise ValueError('GVD: You either need to specify one "model_dir" for both original and anonymized data or ' @@ -208,11 +209,11 @@ def get_similarity_matrix(vd_model, out_dir, exp_name, segments_folder): gvd_value = vd.gvd(oo_sim, pp_sim) if vd else vd_orig.gvd(oo_sim, pp_sim) with open(trial_out_dir / 'gain_of_voice_distinctiveness', 'w') as f: f.write(str(gvd_value)) - print(f'{trial} gvd={gvd_value}') + logging.info(f'{trial} gvd={gvd_value}') def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, anon_data_suffix: str): - print(f'Train ASR model: {model_dir}') + logging.info(f'Train ASR model: {model_dir}') exp_dir = Path('exp', model_name) libri_dir = Path(libri_dir).expanduser() # could be relative to userdir ngpu = min(params.get('num_gpus', 0), torch.cuda.device_count()) # cannot use more gpus than available @@ -250,7 +251,7 @@ def asr_train(params: dict, libri_dir: Path, model_name: str, model_dir: Path, a cwd = Path.cwd() os.chdir('evaluation/utility/asr') # espnet recipe needs several files at specific relative positions - print(Path.cwd()) + logging.debug(Path.cwd()) subprocess.run(['./asr.sh'] + train_params) subprocess.run(['ln', '-srf', exp_dir, model_dir]) @@ -258,7 +259,7 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_path, libri_dir, anon_data_suffix): - print(f'Use ASR model for evaluation: {model_path}') + logging.info(f'Use ASR model for evaluation: {model_path}') test_sets = [] for asr_dataset in eval_datasets: @@ -292,6 +293,8 @@ if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + params = parse_yaml(Path('configs', args.config)) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') @@ -312,9 +315,9 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat asv_train_params = asv_params['training'] if not model_dir.exists() or asv_train_params.get('retrain', True) is True: start_time = time.time() - print('Perform ASV training') + logging.info('Perform ASV training') asv_train(train_params=asv_train_params, output_dir=asv_params['model_dir']) - print("ASV training time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("ASV training time: %f min ---" % (float(time.time() - start_time) / 60)) model_dir = scan_checkpoint(model_dir, 'CKPT') if asv_params['vec_type'] == 'xvector': shutil.copy('evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml', model_dir) @@ -322,11 +325,11 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat shutil.copy('evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml', model_dir) if 'evaluation' in asv_params: - print('Perform ASV evaluation') + logging.info('Perform ASV evaluation') start_time = time.time() asv_eval(eval_datasets=eval_data_trials, eval_data_dir=eval_data_dir, params=asv_params, device=device, model_dir=model_dir, anon_data_suffix=anon_suffix) - print("--- EER computation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- EER computation time: %f min ---" %
(float(time.time() - start_time) / 60)) if 'utility' in eval_steps: if 'asr' in eval_steps['utility']: @@ -344,10 +347,10 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat if not model_dir.exists() or asr_train_params.get('retrain', True) is True: start_time = time.time() - print('Perform ASR training') + logging.info('Perform ASR training') asr_train(params=asr_train_params, libri_dir=libri_dir, model_name=model_name, model_dir=model_dir, anon_data_suffix=anon_suffix) - print("--- ASR training time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- ASR training time: %f min ---" % (float(time.time() - start_time) / 60)) if 'evaluation' in asr_params: asr_eval_params = asr_params['evaluation'] @@ -358,15 +361,15 @@ def asr_eval_sh(eval_datasets: List[str], eval_data_dir: Path, params, model_pat asr_model_path = model_dir / 'asr_train_asr_transformer_raw_en_bpe5000' start_time = time.time() - print('Perform ASR evaluation') + logging.info('Perform ASR evaluation') asr_eval_sh(eval_datasets=eval_data_asr, eval_data_dir=eval_data_dir, params=asr_eval_params, model_path=asr_model_path, anon_data_suffix=anon_suffix, libri_dir=libri_dir) - print("--- ASR evaluation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- ASR evaluation time: %f min ---" % (float(time.time() - start_time) / 60)) if 'gvd' in eval_steps['utility']: gvd_params = params['utility']['gvd'] start_time = time.time() - print('Perform GVD evaluation') + logging.info('Perform GVD evaluation') gvd_eval(eval_datasets=eval_data_trials, eval_data_dir=eval_data_dir, params=gvd_params, device=device, anon_data_suffix=anon_suffix) - print("--- GVD computation time: %f min ---" % (float(time.time() - start_time) / 60)) + logging.info("--- GVD computation time: %f min ---" % (float(time.time() - start_time) / 60)) diff --git a/utils/data_io.py b/utils/data_io.py index 0bd03ee..4dabab0 100644 --- a/utils/data_io.py +++ b/utils/data_io.py @@ -2,7 +2,9 @@ from hyperpyyaml import load_hyperpyyaml, dump_hyperpyyaml import json import pandas as pd +import logging +logger = logging.getLogger(__name__) def read_kaldi_format(filename, return_as_dict=True, values_as_string=False): key_list = [] @@ -56,7 +58,7 @@ def save_kaldi_format(data, filename): #value = value.encode('utf-8') f.write(f'{key} {value}\n') except UnicodeEncodeError: - print(f'{key} {value}') + logger.error(f'{key} {value}') raise From d7f08778bc6bc4bec3df4d23015078169f7cdf99 Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 10:30:59 +0100 Subject: [PATCH 32/33] Fix GVD accidentally disabled on one of the configs --- configs/eval_pre_ecapa_cos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/eval_pre_ecapa_cos.yaml b/configs/eval_pre_ecapa_cos.yaml index 672224e..bf9ec9e 100644 --- a/configs/eval_pre_ecapa_cos.yaml +++ b/configs/eval_pre_ecapa_cos.yaml @@ -8,7 +8,7 @@ eval_steps: # all metrics in this list will be computed in the evaluation. Remo - asv utility: - asr - # - gvd + - gvd anon_data_suffix: res # suffix for dataset to signal that it is anonymized eval_data_dir: results/formatted_data/26-11-23_21:40/ # path to anonymized evaluation data in kaldi format, e.g. /libri_test_enrolls/wav.scp etc. 
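The print-to-logging conversion earlier in this series consistently applies the standard-library convention: every module creates its own logger via logging.getLogger(__name__), and only the entry-point scripts call logging.basicConfig(). A minimal sketch of that convention follows; the module function and message are illustrative, not code from the repository:

import logging

logger = logging.getLogger(__name__)  # one logger per module, named after the module

def synthesize(num_utts):
    # library code only emits records; it never installs handlers itself
    logger.info('Synthesize %d utterances...', num_utts)

if __name__ == '__main__':
    # handlers, level, and format are configured exactly once, in the entry point
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    synthesize(3)

With this split, importing a module never changes global logging state, and the entry point controls verbosity for the whole pipeline (e.g. the per-file logger.debug calls above stay silent at the INFO level).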
From a68c555c6f0e8bd84793c1cf44ef0dcf587fea5f Mon Sep 17 00:00:00 2001 From: Unal Ege Gaznepoglu Date: Fri, 22 Dec 2023 14:26:04 +0100 Subject: [PATCH 33/33] Dump settings of the pool anonymizer --- .../modules/speaker_embeddings/anonymization/pool_anon.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py index c24151f..6a822bd 100644 --- a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -105,8 +105,11 @@ def __init__( suffix (str): Suffix to append to the output folder names. """ - print(locals()) - super().__init__(vec_type=vec_type, device=device, suffix=suffix) + # forward all constructor settings to the base class; 'self' and '__class__' are captured by locals() but are not settings + kwargs = locals() + kwargs.pop("self") + kwargs.pop("__class__") + super().__init__(**kwargs) self.model_name = model_name if model_name else f"pool_{vec_type}"
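The locals()-forwarding idiom in the patch above can be sketched as follows. The class and parameter names here are illustrative stand-ins, not the repository's classes, and the sketch assumes the base class accepts every forwarded keyword argument:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BaseAnonymizer:
    def __init__(self, vec_type='xvector', device=None, suffix='', model_name=None):
        # the base class receives the complete settings dict and can dump it once
        logger.info('Anonymizer settings: %s',
                    {'vec_type': vec_type, 'device': device,
                     'suffix': suffix, 'model_name': model_name})
        self.vec_type = vec_type
        self.device = device
        self.suffix = suffix

class PoolAnonymizer(BaseAnonymizer):
    def __init__(self, vec_type='xvector', device=None, suffix='', model_name=None):
        # snapshot the arguments before binding any other local variable,
        # otherwise locals() would capture those as well
        kwargs = locals()
        kwargs.pop('self')       # the instance itself is not a keyword argument
        kwargs.pop('__class__')  # added implicitly because super() is used below
        super().__init__(**kwargs)
        self.model_name = model_name if model_name else f'pool_{vec_type}'

PoolAnonymizer(model_name='demo_pool')  # logs the full settings dict once

The upside of this pattern is that newly added constructor parameters are forwarded (and therefore dumped) automatically; the cost is that locals() must be called on the very first line of __init__, and the zero-argument super() call is what puts '__class__' into locals() in the first place.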