Merge branch 'main' into speechbrain_asr_eval
# Conflicts:
#	README.md
#	anonymization/modules/speaker_embeddings/anonymization/gan_anon.py
#	anonymization/modules/speaker_embeddings/anonymization/pool_anon.py
#	anonymization/modules/speaker_embeddings/speaker_extraction.py
#	anonymization/pipelines/sttts_pipeline.py
#	configs/anon_ims_sttts_pc.yaml
#	evaluation/privacy/asv/asv.py
#	evaluation/utility/voice_distinctiveness/deid_gvd.py
#	run_evaluation.py
Sarina Meyer committed Dec 24, 2023
2 parents 7da637f + 3a0c3b7 commit ddb1d80
Showing 54 changed files with 1,092 additions and 626 deletions.
54 changes: 54 additions & 0 deletions Makefile
@@ -0,0 +1,54 @@
###############################
## CONFIGURATION
###############################
.PHONY: install uninstall pretrained_models
.ONESHELL:

PROJECT_NAME = voicepat
ENV_NAME = $(PROJECT_NAME)_env

ifeq (, $(shell mamba --version))
CONDA = conda
else
CONDA = mamba
endif

###############################
##@ INSTALLATION
###############################

install: $(ENV_NAME) ## performs the installation. Currently the only step is to install the conda environment

uninstall:
@rm -rf $(ENV_NAME)
@rm -rf models/

pretrained_models: ## downloads the pretrained models from IMS repositories
@echo Downloading models from IMS repositories
@rm -rf models
@mkdir -p models
@wget -q -O models/anonymization.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/anonymization.zip
@wget -q -O models/asr.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/asr.zip
@wget -q -O models/tts.zip https://github.com/DigitalPhonetics/speaker-anonymization/releases/download/v2.0/tts.zip
@wget -q -O models/pre_eval_models.zip https://github.com/DigitalPhonetics/VoicePAT/releases/download/v1/pre_eval_models.zip
@unzip -oq models/asr.zip -d models
@unzip -oq models/tts.zip -d models
@unzip -oq models/anonymization.zip -d models
@mkdir -p evaluation/utility/asr/exp
@unzip -oq models/pre_eval_models.zip -d evaluation/utility/asr/exp
@ln -srf evaluation/utility/asr/exp exp
@cp evaluation/privacy/asv/
@rm models/*.zip


$(ENV_NAME): environment.yaml
@($(CONDA) env create -f $< -p ./$@ && echo "Installation complete, please run 'conda develop .' once.") || $(CONDA) env update -f $< -p ./$@
@conda config --set env_prompt '($$(basename {default_env})) '
@(cat .gitignore | grep -q $(ENV_NAME)) || echo $(ENV_NAME) >> .gitignore

###############################
##@ SELF-DOCUMENTING COMMAND
###############################

help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
49 changes: 30 additions & 19 deletions README.md
@@ -1,11 +1,10 @@
# [VoicePAT: Voice Privacy Anonymization Toolkit](http://arxiv.org/abs/2309.08049)

**Note: This repository and its documentation are still under construction but can already be used for both
anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicPAT framework.
anonymization and evaluation. We welcome all contributions to introduce more generation methods or evaluation metrics to the VoicePAT framework.
If you are interested in contributing, please leave comments on a GitHub issue.**

VoicePAT is a toolkit for speaker anonymization research, with special focus on speaker anonymization.
It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements:
VoicePAT is a toolkit for speaker anonymization research. It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements:

* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of
anonymized speech is independent of the evaluation of anonymization systems. Both processes do not need to be
@@ -26,27 +25,36 @@ It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.


## Installation
Simply clone the repository and install the dependencies in [requirements.txt](requirements.txt). If you want to use
the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to
it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/espnet_asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``.

Requires `conda` for environment management; installing `mamba` is also recommended to speed up environment-related tasks. Simply clone the repository and run the following commands; a conda environment will be created in the project root folder and the pretrained models will be downloaded.

```bash
sudo apt install libespeak-ng # alternatively use your own package manager
make install pretrained_models
```

The datasets have to be downloaded via the VoicePrivacy Challenge framework. Once the download is complete, the `.scp` files need to be converted to absolute paths, because they are stored relative to the challenge folder; use [utils/relative_scp_to_abs.py](utils/relative_scp_to_abs.py) for this purpose, as sketched below. Then simply point `data_path` in the YAML configurations to the data folder of the VoicePrivacy Challenge framework.
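
For context, a minimal sketch of what this conversion amounts to, assuming standard Kaldi `wav.scp` entries of the form `<utt-id> <relative-path>` (the actual script may handle additional cases, e.g., piped commands):

```python
# Hedged sketch of a relative-to-absolute wav.scp conversion; the real
# utils/relative_scp_to_abs.py may differ in interface and edge cases.
from pathlib import Path

def scp_to_abs(scp_file: str, challenge_root: str) -> None:
    scp_path = Path(scp_file)
    root = Path(challenge_root).resolve()
    converted = []
    for line in scp_path.read_text().splitlines():
        utt_id, rel_path = line.split(maxsplit=1)
        converted.append(f"{utt_id} {root / rel_path}")
    scp_path.write_text("\n".join(converted) + "\n")
```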

If you want to use the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPNet](https://github.com/espnet/espnet/) and insert the link to it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``.

## Usage

![](figures/framework.png)

For using the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can
also add more modules and models to the code and create your own config by using the existing ones as template.

To use the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can also add more modules and models to the code and create your own config by using the existing ones as a template. The configuration files use HyperPyYAML syntax, for which a useful reference is available [here](https://colab.research.google.com/drive/1Pg9by4b6-8QD2iC0U7Ic3Vxq4GEwEdDz?usp=sharing); loading such a config is sketched below.
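
A minimal sketch of loading one of these configs with the `hyperpyyaml` package (the toolkit's entry points may wrap this differently, and the override key is only an example):

```python
# Hedged sketch: load a HyperPyYAML config and override placeholder entries.
from hyperpyyaml import load_hyperpyyaml

with open("configs/anon_ims_sttts_pc.yaml") as f:
    # `overrides` replaces entries such as data_dir at load time
    config = load_hyperpyyaml(f, overrides={"data_dir": "/path/to/data"})
```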

### Anonymization

The framework currently contains only one pipeline and config for anonymization, [anon_ims_sttts_pc.yaml](configs/anon_ims_sttts_pc.yaml). If you are using this config, you need to modify at least the following entries:
```
data_dir: path to original data in Kaldi-format for anonymization
results_dir: path to location for all (intermediate) results of the anonymization
models_dir: path to models location

```YAML
data_dir: # path to original data in Kaldi-format for anonymization
results_dir: # path to location for all (intermediate) results of the anonymization
models_dir: # path to models location
```
Running an anonymization pipeline is done like this:
```
python run_anonymization.py --config anon_ims_sttts_pc.yaml --gpu_ids 0,1 --force_compute
```
@@ -59,25 +67,28 @@ Pretrained models for this anonymization can be found at [https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases.

### Evaluation
All other config files in [configs](configs) can be used for evaluation with different settings. In these configs,
you need to adapt at least

All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, you need to adapt at least:

```
eval_data_dir: path to anonymized evaluation data in Kaldi-format
asr/libri_dir: path to original LibriSpeech dataset
```

Running an evaluation pipeline is done like this:

```
python run_evaluation.py --config eval_pre_ecapa_cos.yaml --gpu_ids 1,2,3
```
making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, it will default to CUDA:0 or
use all GPUs if
cuda is available, or run on CPU otherwise.

Pretrained evaluation models can be found in release v1.
making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU is specified, the process defaults to `cuda:0` (or all available GPUs) when CUDA is available, and runs on the CPU otherwise.

Pretrained evaluation models can be found in release v1.

## Acknowledgements

Several parts of this toolkit are based on or use code from external sources, i.e.,

* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation
* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization)
for
5 changes: 5 additions & 0 deletions anonymization/README.md
@@ -4,4 +4,9 @@ The anonymization branch can contain multiple pipelines, modules and models. So
[Speech-to-Text-to-Speech (STTTS) pipeline](https://ieeexplore.ieee.org/document/10096607), based on this code:
[https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization).


# Experiment with different speaker embedding mappings

This is now simplified: you can define your anonymizer (a function that yields a speaker embedding when a speaker embedding is supplied) using the `!new` syntax of HyperPyYAML in a config file (e.g., see [ims_gan.yaml](../configs/anon/ims_gan.yaml)). The only requirement is that your anonymizer must implement the `BaseAnonymizer` API (see [base_anon.py](modules/speaker_embeddings/anonymization/base_anon.py)).
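
For illustration, a hypothetical anonymizer could look like the sketch below; the class name, the noise strategy and the `scale` parameter are invented for the example, while the constructor and `anonymize_embeddings` signatures follow `BaseAnonymizer`:

```python
# Hypothetical custom anonymizer that perturbs each speaker embedding with
# Gaussian noise; only the BaseAnonymizer API is taken from the toolkit.
import torch

from anonymization.modules.speaker_embeddings.anonymization.base_anon import BaseAnonymizer


class NoiseAnonymizer(BaseAnonymizer):
    def __init__(self, vec_type="xvector", device=None, suffix="_noise", scale=0.1, **kwargs):
        # extra kwargs such as scale are stored by the base class, so the
        # instance stays dumpable back to YAML
        super().__init__(vec_type=vec_type, device=device, suffix=suffix, scale=scale, **kwargs)
        self.scale = scale

    def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk") -> torch.Tensor:
        # shift every embedding by random noise so the original speaker
        # vector cannot be read off directly
        return speaker_embeddings + self.scale * torch.randn_like(speaker_embeddings)
```

In a config, such a class would then be instantiated via `!new:<module path>.NoiseAnonymizer`, passing `scale` and the other arguments as keyword entries.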

*This documentation is still under construction and will be extended soon.*
@@ -1,3 +1,4 @@
import logging
import torch
torch.set_num_threads(1)

@@ -11,6 +12,7 @@
from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator
from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth

logger = logging.getLogger(__name__)

class ImsProsodyExtractor:

@@ -54,7 +56,7 @@ def extract_prosody(self,
try:
norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
except ValueError:
print('Something went wrong, the reference wave might be too short.')
logger.error('Something went wrong, the reference wave might be too short.')
raise RuntimeError

with torch.inference_mode():
10 changes: 6 additions & 4 deletions anonymization/modules/prosody/prosody_extraction.py
@@ -1,3 +1,4 @@
import logging
import torch
torch.set_num_threads(1)

@@ -8,6 +9,7 @@
from .extraction import *
from utils import read_kaldi_format

logger = logging.getLogger(__name__)

class ProsodyExtraction:

@@ -47,7 +49,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
wav_scp = {utt: wav_scp[utt] for utt in unprocessed_utts}

if wav_scp:
print(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances')
logger.info(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances')
data_prosody.new = True
i = 0
for utt, wav_path in tqdm(wav_scp.items()):
@@ -56,7 +58,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
utt_prosody = self.extractor.extract_prosody(transcript=text, ref_audio_path=wav_path,
input_is_phones=text_is_phones)
except IndexError:
print(f'Index Error for {utt}')
logger.warning(f'IndexError for {utt}')
continue
duration, pitch, energy, start_silence, end_silence = utt_prosody
data_prosody.add_instance(utterance=utt, duration=duration, pitch=pitch, energy=energy,
@@ -69,8 +71,8 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
data_prosody.save_prosody(dataset_results_dir)

elif len(data_prosody.utterances) > 0:
print('No prosody extraction necessary; load stored values instead...')
logger.info('No prosody extraction necessary; load stored values instead...')
else:
print(f'No utterances could be found in {dataset_path}!')
logger.warning(f'No utterances could be found in {dataset_path}!')

return data_prosody
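
Because these modules now report through the standard `logging` package instead of `print`, the calling script decides how much of this output is shown; a minimal sketch:

```python
# Minimal sketch: surface INFO-level messages from the extraction modules.
import logging

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(name)s %(levelname)s: %(message)s")
```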
@@ -1,22 +1,70 @@
from pathlib import Path
import torch
import ruamel.yaml as yaml
from ruamel.yaml.representer import RoundTripRepresenter, SafeRepresenter
from typing import Union


class BaseAnonymizer:
"""
Base class for speaker embedding anonymizers, defining the API,
which consists of the following methods:
- anonymize_embeddings
- to
"""
def __init__(
self,
vec_type: str,
device: Union[str, torch.device, int, None],
suffix: str,
**kwargs,
):
assert suffix[0] == "_", "Suffix must be a string and start with an underscore."

def __init__(self, vec_type='xvector', device=None, **kwargs):
# Base class for speaker embedding anonymization.
self.vec_type = vec_type
self.suffix = suffix

if isinstance(device, torch.device):
self.device = device
elif isinstance(device, str):
self.device = torch.device(device)
elif isinstance(device, int):
self.device = torch.device(f'cuda:{device}')
self.device = torch.device(f"cuda:{device}")
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)

def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'):
# ensure dumpability
self.kwargs = kwargs
self.kwargs["vec_type"] = self.vec_type
self.kwargs["device"] = str(self.device)
self.kwargs["suffix"] = self.suffix

def __repr__(self):
if hasattr(self, "kwargs"):
return f"{self.__class__.__name__}({self.kwargs})"
else:
return f"{self.__class__.__name__}()"

def to_yaml(self, representer: yaml.Representer):
# first get data into dict format
data = {f"!new:{type(self).__qualname__}": self.kwargs}
return_str = representer.represent_dict(data)
return return_str

def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str = "spk") -> torch.Tensor:
# Template method for anonymizing a dataset. Not implemented.
raise NotImplementedError('anonymize_data')
raise NotImplementedError("anonymize_data")

def to(self, device):
self.device = device


# necessary to make BaseAnonymizer and subclasses dumpable
RoundTripRepresenter.add_multi_representer(
BaseAnonymizer, lambda representer, data: data.to_yaml(representer)
)
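
A short sketch of what this registration enables, assuming the class above is in scope (the exact output formatting is illustrative):

```python
# Hedged sketch: with the multi-representer registered on
# RoundTripRepresenter, a configured anonymizer dumps back to
# HyperPyYAML-style text.
import sys
import ruamel.yaml

anon = BaseAnonymizer(vec_type="xvector", device="cpu", suffix="_base")

yaml_rt = ruamel.yaml.YAML()  # round-trip mode uses RoundTripRepresenter
yaml_rt.dump(anon, sys.stdout)
# expected shape (illustrative):
# '!new:BaseAnonymizer':
#   vec_type: xvector
#   device: cpu
#   suffix: _base
```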
