From b544c1365d3e5b3484635ded288876754fcca908 Mon Sep 17 00:00:00 2001
From: Sarina Meyer
Date: Wed, 6 Sep 2023 19:19:43 +0200
Subject: [PATCH] Initial Commit

---
 .gitignore | 129 +
 README.md | 88 +-
 anonymization/README.md | 7 +
 anonymization/__init__.py | 0
 anonymization/modules/__init__.py | 4 +
 anonymization/modules/prosody/__init__.py | 2 +
 .../modules/prosody/anonymization/__init__.py | 1 +
 .../ims_prosody_anonymization.py | 29 +
 .../modules/prosody/extraction/__init__.py | 1 +
 .../extraction/ims_prosody_extraction.py | 153 +
 anonymization/modules/prosody/prosody.py | 117 +
 .../modules/prosody/prosody_anonymization.py | 49 +
 .../modules/prosody/prosody_extraction.py | 76 +
 .../modules/speaker_embeddings/__init__.py | 3 +
 .../anonymization/__init__.py | 3 +
 .../anonymization/base_anon.py | 22 +
 .../anonymization/gan_anon.py | 83 +
 .../anonymization/pool_anon.py | 154 +
 .../anonymization/random_anon.py | 68 +
 .../anonymization/utils/WGAN/__init__.py | 1 +
 .../utils/WGAN/embeddings_generator.py | 34 +
 .../anonymization/utils/WGAN/init_wgan.py | 65 +
 .../anonymization/utils/WGAN/resnet_1.py | 175 +
 .../anonymization/utils/WGAN/wgan_qc.py | 276 +
 .../anonymization/utils/__init__.py | 0
 .../anonymization/utils/plda_model.py | 87 +
 .../speaker_embeddings/extraction/__init__.py | 0
 .../extraction/embedding_methods/__init__.py | 2 +
 .../embedding_methods/speechbrain_vectors.py | 36 +
 .../embedding_methods/style_embeddings.py | 31 +
 .../ims_speaker_extraction_methods.py | 27 +
 .../speaker_anonymization.py | 74 +
 .../speaker_embeddings/speaker_embeddings.py | 144 +
 .../speaker_embeddings/speaker_extraction.py | 139 +
 anonymization/modules/text/__init__.py | 1 +
 .../modules/text/recognition/__init__.py | 0
 .../modules/text/recognition/ims_asr.py | 59 +
 .../modules/text/speech_recognition.py | 149 +
 anonymization/modules/text/text.py | 122 +
 .../modules/tts/IMSToucan/.gitignore | 20 +
 .../InferenceInterfaces/AnonFastSpeech2.py | 223 +
 .../InferenceFastSpeech2.py | 310 +
 .../InferenceHiFiGAN.py | 91 +
 .../InferenceArchitectures/__init__.py | 0
 .../IMSToucan/InferenceInterfaces/__init__.py | 0
 anonymization/modules/tts/IMSToucan/LICENSE | 202 +
 .../modules/tts/IMSToucan/Layers/Attention.py | 324 +
 .../modules/tts/IMSToucan/Layers/Conformer.py | 128 +
 .../tts/IMSToucan/Layers/Convolution.py | 55 +
 .../tts/IMSToucan/Layers/DurationPredictor.py | 139 +
 .../tts/IMSToucan/Layers/EncoderLayer.py | 144 +
 .../modules/tts/IMSToucan/Layers/LayerNorm.py | 36 +
 .../tts/IMSToucan/Layers/LengthRegulator.py | 62 +
 .../IMSToucan/Layers/MultiLayeredConv1d.py | 87 +
 .../tts/IMSToucan/Layers/MultiSequential.py | 33 +
 .../IMSToucan/Layers/PositionalEncoding.py | 166 +
 .../Layers/PositionwiseFeedForward.py | 26 +
 .../modules/tts/IMSToucan/Layers/PostNet.py | 74 +
 .../tts/IMSToucan/Layers/RNNAttention.py | 282 +
 .../tts/IMSToucan/Layers/ResidualBlock.py | 98 +
 .../tts/IMSToucan/Layers/ResidualStack.py | 51 +
 .../modules/tts/IMSToucan/Layers/STFT.py | 118 +
 .../modules/tts/IMSToucan/Layers/Swish.py | 18 +
 .../tts/IMSToucan/Layers/VariancePredictor.py | 65 +
 .../modules/tts/IMSToucan/Layers/__init__.py | 0
 .../Preprocessing/AudioPreprocessor.py | 168 +
 .../IMSToucan/Preprocessing/TextFrontend.py | 463 +
 .../tts/IMSToucan/Preprocessing/__init__.py | 0
 .../Preprocessing/articulatory_features.py | 946 +
 anonymization/modules/tts/IMSToucan/README.md | 329 +
 .../Spectrogram_to_Embedding/GST.py | 225 +
 .../StyleEmbedding.py | 67 +
 .../Spectrogram_to_Embedding/__init__.py | 0
 .../AutoAligner/Aligner.py | 297 +
.../AutoAligner/__init__.py | 0 .../FastSpeech2/DurationCalculator.py | 31 + .../FastSpeech2/EnergyCalculator.py | 94 + .../FastSpeech2/PitchCalculator.py | 119 + .../FastSpeech2/__init__.py | 0 .../Text_to_Spectrogram/__init__.py | 0 .../IMSToucan/TrainingInterfaces/__init__.py | 0 .../modules/tts/IMSToucan/Utility/__init__.py | 0 .../modules/tts/IMSToucan/Utility/utils.py | 320 + .../modules/tts/IMSToucan/UtteranceCloner.py | 278 + .../modules/tts/IMSToucan/__init__.py | 0 .../modules/tts/IMSToucan/requirements.txt | 26 + anonymization/modules/tts/__init__.py | 1 + anonymization/modules/tts/ims_tts.py | 55 + anonymization/modules/tts/speech_synthesis.py | 145 + anonymization/pipelines/__init__.py | 0 anonymization/pipelines/sttts_pipeline.py | 112 + configs/anon_ims_sttts_pc.yaml | 97 + configs/eval_gvd_both.yaml | 70 + configs/eval_post_ecapa_cos_ft.yaml | 104 + configs/eval_post_ecapa_cos_scratch.yaml | 104 + configs/eval_post_xvector_plda_scratch.yaml | 104 + configs/eval_pre_ecapa_cos.yaml | 104 + configs/eval_pre_ecapa_plda.yaml | 104 + configs/eval_pre_xvector_cos.yaml | 104 + configs/eval_pre_xvector_plda.yaml | 104 + evaluation/README.md | 27 + evaluation/__init__.py | 2 + evaluation/privacy/__init__.py | 1 + evaluation/privacy/asv/__init__.py | 2 + evaluation/privacy/asv/asv.py | 167 + evaluation/privacy/asv/asv_train/__init__.py | 1 + .../privacy/asv/asv_train/asv_dataset.py | 76 + .../asv_train/hparams/ecapa/hyperparams.yaml | 53 + .../asv/asv_train/hparams/hyperparams.yaml | 53 + .../hparams/train_ecapa_tdnn_small.yaml | 120 + .../hparams/train_ecapa_tdnn_small_ft.yaml | 136 + .../asv_train/hparams/train_x_vectors.yaml | 120 + .../asv_train/hparams/train_x_vectors_ft.yaml | 137 + .../asv_train/hparams/verification_ecapa.yaml | 80 + .../hparams/verification_plda_xvector.yaml | 81 + .../hparams/xvector/hyperparams.yaml | 56 + .../privacy/asv/asv_train/libri_prepare.py | 432 + .../asv/asv_train/speechbrain_defaults.py | 14 + .../asv/asv_train/train_speaker_embeddings.py | 160 + evaluation/privacy/asv/metrics/__init__.py | 0 evaluation/privacy/asv/metrics/cllr.py | 129 + evaluation/privacy/asv/metrics/helpers.py | 349 + evaluation/privacy/asv/metrics/linkability.py | 87 + .../privacy/asv/metrics/utils/__init__.py | 0 evaluation/privacy/asv/metrics/utils/io.py | 131 + .../privacy/asv/metrics/utils/plo_plots.py | 352 + .../asv/metrics/utils/visualization.py | 160 + .../privacy/asv/metrics/utils/zebra_plots.py | 175 + evaluation/privacy/asv/metrics/zebra.py | 104 + evaluation/utility/__init__.py | 1 + evaluation/utility/asr/__init__.py | 0 evaluation/utility/asr/asr.sh | 1840 + evaluation/utility/asr/asr_old.sh | 1675 + evaluation/utility/asr/cmd.sh | 110 + evaluation/utility/asr/conf/decode_asr.yaml | 3 + .../utility/asr/conf/decode_asr_anon.yaml | 3 + .../utility/asr/conf/decode_asr_rnnt.yaml | 5 + .../conf/decode_asr_transformer_with_k2.yaml | 7 + evaluation/utility/asr/conf/fbank.conf | 2 + evaluation/utility/asr/conf/pbs.conf | 11 + evaluation/utility/asr/conf/pitch.conf | 1 + evaluation/utility/asr/conf/queue.conf | 12 + evaluation/utility/asr/conf/slurm.conf | 27 + .../asr/conf/train_asr_branchformer.yaml | 81 + .../utility/asr/conf/train_asr_conformer.yaml | 76 + .../utility/asr/conf/train_asr_rnnt.yaml | 82 + .../asr/conf/train_asr_transformer.yaml | 62 + .../asr/conf/train_asr_transformer_anon.yaml | 62 + .../asr/conf/train_lm_transformer.yaml | 31 + evaluation/utility/asr/conf/train_rnn_lm.yaml | 16 + ...chformer_hop_length160_e18_linear3072.yaml | 81 + 
.../asr/conf/tuning/train_asr_conformer.yaml | 68 + .../train_asr_conformer10_hop_length160.yaml | 76 + .../asr/conf/tuning/train_asr_conformer4.yaml | 68 + .../asr/conf/tuning/train_asr_conformer5.yaml | 68 + ...asr_conformer6_n_fft400_hop_length160.yaml | 72 + ...asr_conformer6_n_fft512_hop_length128.yaml | 72 + ...asr_conformer6_n_fft512_hop_length256.yaml | 72 + ...ain_asr_conformer7_hubert_ll60k_large.yaml | 85 + ...asr_conformer7_n_fft512_hop_length256.yaml | 72 + ...n_asr_conformer7_wav2vec2_960hr_large.yaml | 86 + .../train_asr_conformer7_wavlm_large.yaml | 85 + .../asr/conf/tuning/train_asr_conformer8.yaml | 76 + ...ain_asr_conformer9_layerdrop0.1_last6.yaml | 90 + .../conf/tuning/train_asr_e_branchformer.yaml | 83 + .../conf/tuning/train_asr_transformer.yaml | 62 + .../conf/tuning/train_asr_transformer3.yaml | 62 + ...arge_lv60_960h_finetuning_last_1layer.yaml | 66 + .../asr/conf/tuning/train_lm_adam.yaml | 16 + .../conf/tuning/train_lm_transformer2.yaml | 31 + .../tuning/train_lm_transformer2_anon.yaml | 31 + .../asr/conf/tuning/transducer/decode.yaml | 5 + .../train_conformer-rnn_transducer.yaml | 82 + .../en_token_list/bpe_unigram5000/bpe.model | Bin 0 -> 325499 bytes .../en_token_list/bpe_unigram5000/bpe.vocab | 5000 + .../en_token_list/bpe_unigram5000/tokens.txt | 5000 + .../en_token_list/bpe_unigram5000/train.txt | 104014 +++++++++++++++ evaluation/utility/asr/db.sh | 324 + evaluation/utility/asr/local/__init__.py | 0 evaluation/utility/asr/local/data.sh | 99 + evaluation/utility/asr/local/data_prep.sh | 85 + .../utility/asr/local/data_prep_anon.py | 11 + .../utility/asr/local/download_and_untar.sh | 97 + evaluation/utility/asr/local/path.sh | 0 evaluation/utility/asr/path.sh | 23 + evaluation/utility/asr/pyscripts/__init__.py | 0 .../utility/asr/pyscripts/audio/__init__.py | 0 .../asr/pyscripts/audio/format_wav_scp.py | 354 + .../asr/pyscripts/audio/trim_silence.py | 150 + .../utility/asr/pyscripts/feats/__init__.py | 0 .../asr/pyscripts/feats/feat-to-shape.py | 83 + .../utility/asr/pyscripts/utils/__init__.py | 0 .../pyscripts/utils/convert_text_to_phn.py | 74 + .../asr/pyscripts/utils/evaluate_f0.py | 344 + .../asr/pyscripts/utils/evaluate_mcd.py | 331 + .../asr/pyscripts/utils/extract_xvectors.py | 185 + .../utility/asr/pyscripts/utils/get_yaml.py | 37 + .../utils/make_token_list_from_config.py | 33 + .../asr/pyscripts/utils/plot_sinc_filters.py | 357 + .../utility/asr/pyscripts/utils/print_args.py | 45 + .../pyscripts/utils/remove_duplicate_keys.py | 49 + .../asr/pyscripts/utils/rotate_logfile.py | 59 + .../asr/pyscripts/utils/score_intent.py | 123 + .../asr/pyscripts/utils/score_lang_id.py | 60 + .../pyscripts/utils/score_summarization.py | 49 + .../asr/pyscripts/utils/utt2spk_to_utt2sid.py | 33 + evaluation/utility/asr/run.sh | 68 + evaluation/utility/asr/scripts/__init__.py | 0 .../asr/scripts/audio/format_wav_scp.sh | 146 + .../utility/asr/scripts/audio/trim_silence.sh | 85 + .../asr/scripts/feats/feat_to_shape.sh | 73 + .../utility/asr/scripts/feats/make_fbank.sh | 172 + .../utility/asr/scripts/feats/make_stft.sh | 154 + .../utility/asr/scripts/text/run_spm.sh | 39 + .../asr/scripts/utils/TEMPLATE_HF_Readme.md | 113 + .../asr/scripts/utils/TEMPLATE_Readme.md | 43 + .../utility/asr/scripts/utils/__init__.py | 0 .../asr/scripts/utils/create_README_file.py | 64 + .../utils/download_from_google_drive.sh | 51 + .../asr/scripts/utils/eval_perm_free_error.py | 203 + .../utility/asr/scripts/utils/evaluate_asr.sh | 284 + .../asr/scripts/utils/get_model_names.py | 8 + 
evaluation/utility/asr/scripts/utils/mfa.sh | 306 + .../scripts/utils/perturb_data_dir_speed.sh | 119 + .../utility/asr/scripts/utils/print_args.sh | 59 + .../asr/scripts/utils/remove_punctuation.pl | 25 + .../asr/scripts/utils/show_asr_result.sh | 81 + .../scripts/utils/show_translation_result.sh | 70 + .../asr/scripts/utils/upload_models_to_hub.sh | 44 + evaluation/utility/asr/steps/__init__.py | 0 .../utility/asr/steps/align_basis_fmllr.sh | 168 + .../asr/steps/align_basis_fmllr_lats.sh | 184 + evaluation/utility/asr/steps/align_fmllr.sh | 158 + .../utility/asr/steps/align_fmllr_lats.sh | 172 + evaluation/utility/asr/steps/align_lvtln.sh | 199 + evaluation/utility/asr/steps/align_mapped.sh | 91 + .../utility/asr/steps/align_raw_fmllr.sh | 151 + evaluation/utility/asr/steps/align_sgmm2.sh | 200 + evaluation/utility/asr/steps/align_si.sh | 106 + evaluation/utility/asr/steps/append_feats.sh | 85 + .../utility/asr/steps/best_path_weights.sh | 118 + .../utility/asr/steps/chain/__init__.py | 0 .../utility/asr/steps/chain/build_tree.sh | 200 + .../chain/build_tree_multiple_sources.sh | 275 + .../utility/asr/steps/chain/e2e/README.txt | 18 + .../utility/asr/steps/chain/e2e/__init__.py | 0 .../steps/chain/e2e/compute_biphone_stats.py | 72 + .../asr/steps/chain/e2e/get_egs_e2e.sh | 415 + .../asr/steps/chain/e2e/prepare_e2e.sh | 120 + .../asr/steps/chain/e2e/text_to_phones.py | 74 + .../utility/asr/steps/chain/e2e/train_e2e.py | 589 + .../utility/asr/steps/chain/gen_topo.pl | 42 + .../utility/asr/steps/chain/gen_topo.py | 48 + .../utility/asr/steps/chain/gen_topo2.py | 55 + .../utility/asr/steps/chain/gen_topo3.py | 41 + .../utility/asr/steps/chain/gen_topo4.py | 46 + .../utility/asr/steps/chain/gen_topo5.py | 50 + .../utility/asr/steps/chain/gen_topo_orig.py | 53 + evaluation/utility/asr/steps/chain/get_egs.sh | 547 + .../asr/steps/chain/get_model_context.sh | 107 + .../utility/asr/steps/chain/get_phone_post.sh | 244 + .../asr/steps/chain/make_weighted_den_fst.sh | 155 + .../steps/chain/multilingual/combine_egs.sh | 168 + evaluation/utility/asr/steps/chain/train.py | 665 + .../utility/asr/steps/chain/train_tdnn.sh | 635 + .../utility/asr/steps/chain2/__init__.py | 0 .../utility/asr/steps/chain2/combine_egs.sh | 167 + .../chain2/compute_preconditioning_matrix.sh | 84 + .../utility/asr/steps/chain2/get_raw_egs.sh | 304 + .../asr/steps/chain2/internal/__init__.py | 0 .../steps/chain2/internal/get_best_model.sh | 44 + .../chain2/internal/get_train_schedule.py | 159 + .../utility/asr/steps/chain2/process_egs.sh | 159 + .../utility/asr/steps/chain2/randomize_egs.sh | 161 + evaluation/utility/asr/steps/chain2/train.sh | 334 + .../steps/chain2/validate_processed_egs.sh | 50 + .../steps/chain2/validate_randomized_egs.sh | 66 + .../asr/steps/chain2/validate_raw_egs.sh | 47 + .../utility/asr/steps/cleanup/__init__.py | 0 .../steps/cleanup/clean_and_segment_data.sh | 214 + .../cleanup/clean_and_segment_data_nnet3.sh | 286 + .../steps/cleanup/combine_short_segments.py | 319 + .../steps/cleanup/create_segments_from_ctm.pl | 481 + .../asr/steps/cleanup/debug_lexicon.sh | 215 + .../cleanup/decode_fmllr_segmentation.sh | 258 + .../asr/steps/cleanup/decode_segmentation.sh | 173 + .../cleanup/decode_segmentation_nnet3.sh | 174 + .../asr/steps/cleanup/find_bad_utts.sh | 200 + .../asr/steps/cleanup/find_bad_utts_nnet.sh | 205 + .../asr/steps/cleanup/internal/__init__.py | 0 .../steps/cleanup/internal/align_ctm_ref.py | 627 + .../steps/cleanup/internal/compute_tf_idf.py | 145 + 
.../asr/steps/cleanup/internal/ctm_to_text.pl | 84 + .../steps/cleanup/internal/get_ctm_edits.py | 351 + .../cleanup/internal/get_non_scored_words.py | 112 + .../steps/cleanup/internal/get_pron_stats.py | 228 + .../cleanup/internal/make_one_biased_lm.py | 321 + .../cleanup/internal/modify_ctm_edits.py | 448 + .../internal/resolve_ctm_edits_overlaps.py | 347 + .../cleanup/internal/retrieve_similar_docs.py | 358 + .../cleanup/internal/segment_ctm_edits.py | 1050 + .../internal/segment_ctm_edits_mild.py | 2074 + .../cleanup/internal/split_text_into_docs.pl | 72 + .../cleanup/internal/stitch_documents.py | 157 + .../steps/cleanup/internal/taint_ctm_edits.py | 252 + .../asr/steps/cleanup/internal/tf_idf.py | 424 + .../asr/steps/cleanup/lattice_oracle_align.sh | 200 + .../steps/cleanup/make_biased_lm_graphs.sh | 167 + .../asr/steps/cleanup/make_biased_lms.py | 110 + .../cleanup/make_segmentation_data_dir.sh | 206 + .../steps/cleanup/make_segmentation_graph.sh | 144 + .../asr/steps/cleanup/make_utterance_fsts.pl | 49 + .../asr/steps/cleanup/make_utterance_graph.sh | 175 + .../steps/cleanup/segment_long_utterances.sh | 469 + .../cleanup/segment_long_utterances_nnet3.sh | 552 + .../asr/steps/cleanup/split_long_utterance.sh | 146 + .../utility/asr/steps/combine_ali_dirs.sh | 215 + .../utility/asr/steps/combine_lat_dirs.sh | 215 + .../utility/asr/steps/combine_trans_dirs.sh | 133 + .../utility/asr/steps/compare_alignments.sh | 220 + .../utility/asr/steps/compute_cmvn_stats.sh | 117 + .../utility/asr/steps/compute_vad_decision.sh | 86 + evaluation/utility/asr/steps/conf/__init__.py | 0 .../asr/steps/conf/append_eval_to_ctm.py | 82 + .../asr/steps/conf/append_prf_to_ctm.py | 76 + .../asr/steps/conf/apply_calibration.sh | 91 + .../asr/steps/conf/convert_ctm_to_tra.py | 38 + .../utility/asr/steps/conf/get_ctm_conf.sh | 100 + .../asr/steps/conf/lattice_depth_per_frame.sh | 39 + .../asr/steps/conf/parse_arpa_unigrams.py | 39 + .../steps/conf/prepare_calibration_data.py | 120 + .../asr/steps/conf/prepare_word_categories.py | 56 + .../asr/steps/conf/train_calibration.sh | 125 + evaluation/utility/asr/steps/copy_ali_dir.sh | 75 + evaluation/utility/asr/steps/copy_lat_dir.sh | 74 + .../utility/asr/steps/copy_trans_dir.sh | 80 + evaluation/utility/asr/steps/data/__init__.py | 0 .../asr/steps/data/augment_data_dir.py | 298 + .../steps/data/data_dir_manipulation_lib.py | 17 + .../utility/asr/steps/data/make_musan.py | 178 + .../utility/asr/steps/data/make_musan.sh | 71 + .../asr/steps/data/reverberate_data_dir.py | 682 + evaluation/utility/asr/steps/decode.sh | 142 + .../utility/asr/steps/decode_basis_fmllr.sh | 234 + evaluation/utility/asr/steps/decode_biglm.sh | 92 + .../utility/asr/steps/decode_combine.sh | 65 + evaluation/utility/asr/steps/decode_fmllr.sh | 229 + .../utility/asr/steps/decode_fmllr_extra.sh | 260 + evaluation/utility/asr/steps/decode_fmmi.sh | 118 + .../utility/asr/steps/decode_fromlats.sh | 95 + evaluation/utility/asr/steps/decode_lvtln.sh | 189 + evaluation/utility/asr/steps/decode_nnet.sh | 169 + evaluation/utility/asr/steps/decode_nolats.sh | 146 + .../utility/asr/steps/decode_raw_fmllr.sh | 234 + evaluation/utility/asr/steps/decode_sgmm2.sh | 226 + .../asr/steps/decode_sgmm2_fromlats.sh | 272 + .../utility/asr/steps/decode_sgmm2_rescore.sh | 113 + .../asr/steps/decode_sgmm2_rescore_project.sh | 177 + evaluation/utility/asr/steps/decode_si.sh | 142 + .../utility/asr/steps/decode_with_map.sh | 119 + .../utility/asr/steps/diagnostic/__init__.py | 0 .../steps/diagnostic/analyze_alignments.sh 
| 61 + .../asr/steps/diagnostic/analyze_lats.sh | 81 + .../diagnostic/analyze_lattice_depth_stats.py | 182 + .../diagnostic/analyze_phone_length_stats.py | 277 + evaluation/utility/asr/steps/dict/__init__.py | 0 .../utility/asr/steps/dict/apply_g2p.sh | 89 + .../asr/steps/dict/apply_g2p_phonetisaurus.sh | 99 + .../asr/steps/dict/apply_lexicon_edits.py | 110 + .../utility/asr/steps/dict/get_pron_stats.py | 97 + .../asr/steps/dict/internal/__init__.py | 0 .../steps/dict/internal/get_subsegments.py | 140 + .../dict/internal/prune_pron_candidates.py | 158 + .../asr/steps/dict/internal/sum_arc_info.py | 136 + .../asr/steps/dict/learn_lexicon_bayesian.sh | 433 + .../asr/steps/dict/learn_lexicon_greedy.sh | 546 + .../asr/steps/dict/merge_learned_lexicons.py | 261 + .../asr/steps/dict/prons_to_lexicon.py | 209 + .../asr/steps/dict/prune_pron_candidates.py | 121 + .../asr/steps/dict/select_prons_bayesian.py | 446 + .../asr/steps/dict/select_prons_greedy.py | 376 + .../utility/asr/steps/dict/train_g2p.sh | 88 + .../asr/steps/dict/train_g2p_phonetisaurus.sh | 88 + evaluation/utility/asr/steps/get_ctm.sh | 93 + evaluation/utility/asr/steps/get_ctm_conf.sh | 100 + .../utility/asr/steps/get_ctm_conf_fast.sh | 87 + evaluation/utility/asr/steps/get_ctm_fast.sh | 86 + .../utility/asr/steps/get_fmllr_basis.sh | 94 + .../utility/asr/steps/get_lexicon_probs.sh | 227 + evaluation/utility/asr/steps/get_prons.sh | 198 + evaluation/utility/asr/steps/get_train_ctm.sh | 101 + .../utility/asr/steps/info/chain_dir_info.pl | 326 + .../utility/asr/steps/info/gmm_dir_info.pl | 324 + .../utility/asr/steps/info/nnet2_dir_info.pl | 282 + .../utility/asr/steps/info/nnet3_dir_info.pl | 318 + .../asr/steps/info/nnet3_disc_dir_info.pl | 172 + evaluation/utility/asr/steps/libs/__init__.py | 11 + evaluation/utility/asr/steps/libs/common.py | 498 + .../utility/asr/steps/libs/nnet3/__init__.py | 13 + .../asr/steps/libs/nnet3/report/__init__.py | 8 + .../asr/steps/libs/nnet3/report/log_parse.py | 543 + .../asr/steps/libs/nnet3/train/__init__.py | 11 + .../libs/nnet3/train/chain_objf/__init__.py | 12 + .../nnet3/train/chain_objf/acoustic_model.py | 632 + .../asr/steps/libs/nnet3/train/common.py | 1037 + .../libs/nnet3/train/dropout_schedule.py | 373 + .../nnet3/train/frame_level_objf/__init__.py | 14 + .../train/frame_level_objf/acoustic_model.py | 88 + .../nnet3/train/frame_level_objf/common.py | 712 + .../nnet3/train/frame_level_objf/raw_model.py | 81 + .../asr/steps/libs/nnet3/xconfig/__init__.py | 39 + .../asr/steps/libs/nnet3/xconfig/attention.py | 249 + .../steps/libs/nnet3/xconfig/basic_layers.py | 1366 + .../libs/nnet3/xconfig/composite_layers.py | 329 + .../steps/libs/nnet3/xconfig/convolution.py | 1203 + .../asr/steps/libs/nnet3/xconfig/gru.py | 2111 + .../asr/steps/libs/nnet3/xconfig/layers.py | 13 + .../asr/steps/libs/nnet3/xconfig/lstm.py | 1198 + .../asr/steps/libs/nnet3/xconfig/parser.py | 207 + .../steps/libs/nnet3/xconfig/stats_layer.py | 145 + .../libs/nnet3/xconfig/trivial_layers.py | 724 + .../asr/steps/libs/nnet3/xconfig/utils.py | 692 + evaluation/utility/asr/steps/lmrescore.sh | 150 + .../utility/asr/steps/lmrescore_const_arpa.sh | 70 + .../lmrescore_const_arpa_undeterminized.sh | 105 + .../utility/asr/steps/lmrescore_rnnlm_lat.sh | 106 + evaluation/utility/asr/steps/make_denlats.sh | 175 + .../utility/asr/steps/make_denlats_sgmm2.sh | 201 + evaluation/utility/asr/steps/make_fbank.sh | 186 + .../utility/asr/steps/make_fbank_pitch.sh | 214 + evaluation/utility/asr/steps/make_index.sh | 118 + 
evaluation/utility/asr/steps/make_mfcc.sh | 194 + .../utility/asr/steps/make_mfcc_pitch.sh | 214 + .../asr/steps/make_mfcc_pitch_online.sh | 204 + .../utility/asr/steps/make_phone_graph.sh | 146 + evaluation/utility/asr/steps/make_plp.sh | 187 + .../utility/asr/steps/make_plp_pitch.sh | 212 + evaluation/utility/asr/steps/nnet/align.sh | 139 + evaluation/utility/asr/steps/nnet/decode.sh | 169 + .../steps/nnet/ivector/extract_ivectors.sh | 213 + .../asr/steps/nnet/ivector/train_diag_ubm.sh | 145 + .../nnet/ivector/train_ivector_extractor.sh | 179 + .../utility/asr/steps/nnet/make_bn_feats.sh | 138 + .../utility/asr/steps/nnet/make_denlats.sh | 217 + .../asr/steps/nnet/make_fmllr_feats.sh | 99 + .../utility/asr/steps/nnet/make_fmmi_feats.sh | 102 + .../utility/asr/steps/nnet/make_priors.sh | 101 + .../utility/asr/steps/nnet/pretrain_dbn.sh | 340 + evaluation/utility/asr/steps/nnet/train.sh | 475 + .../utility/asr/steps/nnet/train_mmi.sh | 231 + .../utility/asr/steps/nnet/train_mpe.sh | 223 + .../utility/asr/steps/nnet/train_scheduler.sh | 206 + .../utility/asr/steps/nnet2/__init__.py | 0 .../utility/asr/steps/nnet2/adjust_priors.sh | 80 + evaluation/utility/asr/steps/nnet2/align.sh | 129 + .../steps/nnet2/check_ivectors_compatible.sh | 40 + .../asr/steps/nnet2/convert_lda_to_raw.sh | 161 + .../asr/steps/nnet2/convert_nnet1_to_nnet2.sh | 73 + .../asr/steps/nnet2/create_appended_model.sh | 75 + evaluation/utility/asr/steps/nnet2/decode.sh | 164 + .../steps/nnet2/dump_bottleneck_features.sh | 116 + evaluation/utility/asr/steps/nnet2/get_egs.sh | 299 + .../utility/asr/steps/nnet2/get_egs2.sh | 337 + .../steps/nnet2/get_egs_discriminative2.sh | 349 + .../utility/asr/steps/nnet2/get_ivector_id.sh | 42 + evaluation/utility/asr/steps/nnet2/get_lda.sh | 188 + .../utility/asr/steps/nnet2/get_lda_block.sh | 127 + .../utility/asr/steps/nnet2/get_num_frames.sh | 25 + .../asr/steps/nnet2/get_perturbed_feats.sh | 104 + .../utility/asr/steps/nnet2/make_denlats.sh | 208 + .../steps/nnet2/make_multisplice_configs.py | 141 + .../utility/asr/steps/nnet2/relabel_egs.sh | 90 + .../utility/asr/steps/nnet2/relabel_egs2.sh | 90 + .../utility/asr/steps/nnet2/remove_egs.sh | 46 + .../utility/asr/steps/nnet2/retrain_fast.sh | 399 + .../asr/steps/nnet2/retrain_simple2.sh | 518 + .../utility/asr/steps/nnet2/retrain_tanh.sh | 218 + .../utility/asr/steps/nnet2/train_block.sh | 388 + .../asr/steps/nnet2/train_convnet_accel2.sh | 678 + .../asr/steps/nnet2/train_discriminative.sh | 414 + .../asr/steps/nnet2/train_discriminative2.sh | 273 + .../nnet2/train_discriminative_multilang2.sh | 302 + .../utility/asr/steps/nnet2/train_more.sh | 272 + .../utility/asr/steps/nnet2/train_more2.sh | 351 + .../asr/steps/nnet2/train_multilang2.sh | 553 + .../steps/nnet2/train_multisplice_accel2.sh | 680 + .../steps/nnet2/train_multisplice_ensemble.sh | 666 + .../utility/asr/steps/nnet2/train_pnorm.sh | 487 + .../asr/steps/nnet2/train_pnorm_accel2.sh | 627 + .../nnet2/train_pnorm_bottleneck_fast.sh | 505 + .../asr/steps/nnet2/train_pnorm_ensemble.sh | 407 + .../asr/steps/nnet2/train_pnorm_fast.sh | 511 + .../steps/nnet2/train_pnorm_multisplice.sh | 572 + .../steps/nnet2/train_pnorm_multisplice2.sh | 597 + .../asr/steps/nnet2/train_pnorm_simple.sh | 549 + .../asr/steps/nnet2/train_pnorm_simple2.sh | 631 + .../utility/asr/steps/nnet2/train_tanh.sh | 419 + .../asr/steps/nnet2/train_tanh_bottleneck.sh | 413 + .../asr/steps/nnet2/train_tanh_fast.sh | 452 + .../utility/asr/steps/nnet2/update_nnet.sh | 276 + .../utility/asr/steps/nnet3/__init__.py | 0 
.../utility/asr/steps/nnet3/adjust_priors.sh | 97 + evaluation/utility/asr/steps/nnet3/align.sh | 140 + .../utility/asr/steps/nnet3/align_lats.sh | 159 + .../utility/asr/steps/nnet3/chain/__init__.py | 0 .../asr/steps/nnet3/chain/build_tree.sh | 200 + .../chain/build_tree_multiple_sources.sh | 275 + .../asr/steps/nnet3/chain/e2e/README.txt | 18 + .../asr/steps/nnet3/chain/e2e/__init__.py | 0 .../nnet3/chain/e2e/compute_biphone_stats.py | 72 + .../asr/steps/nnet3/chain/e2e/get_egs_e2e.sh | 415 + .../asr/steps/nnet3/chain/e2e/prepare_e2e.sh | 120 + .../steps/nnet3/chain/e2e/text_to_phones.py | 74 + .../asr/steps/nnet3/chain/e2e/train_e2e.py | 589 + .../utility/asr/steps/nnet3/chain/gen_topo.pl | 42 + .../utility/asr/steps/nnet3/chain/gen_topo.py | 48 + .../asr/steps/nnet3/chain/gen_topo2.py | 55 + .../asr/steps/nnet3/chain/gen_topo3.py | 41 + .../asr/steps/nnet3/chain/gen_topo4.py | 46 + .../asr/steps/nnet3/chain/gen_topo5.py | 50 + .../asr/steps/nnet3/chain/gen_topo_orig.py | 53 + .../utility/asr/steps/nnet3/chain/get_egs.sh | 547 + .../steps/nnet3/chain/get_model_context.sh | 107 + .../asr/steps/nnet3/chain/get_phone_post.sh | 244 + .../nnet3/chain/make_weighted_den_fst.sh | 155 + .../nnet3/chain/multilingual/combine_egs.sh | 168 + .../utility/asr/steps/nnet3/chain/train.py | 665 + .../asr/steps/nnet3/chain/train_tdnn.sh | 635 + .../asr/steps/nnet3/chain2/__init__.py | 0 .../asr/steps/nnet3/chain2/combine_egs.sh | 167 + .../chain2/compute_preconditioning_matrix.sh | 84 + .../asr/steps/nnet3/chain2/get_raw_egs.sh | 304 + .../steps/nnet3/chain2/internal/__init__.py | 0 .../nnet3/chain2/internal/get_best_model.sh | 44 + .../chain2/internal/get_train_schedule.py | 159 + .../asr/steps/nnet3/chain2/process_egs.sh | 159 + .../asr/steps/nnet3/chain2/randomize_egs.sh | 161 + .../utility/asr/steps/nnet3/chain2/train.sh | 334 + .../nnet3/chain2/validate_processed_egs.sh | 50 + .../nnet3/chain2/validate_randomized_egs.sh | 66 + .../steps/nnet3/chain2/validate_raw_egs.sh | 47 + .../utility/asr/steps/nnet3/components.py | 488 + .../utility/asr/steps/nnet3/compute_output.sh | 128 + .../asr/steps/nnet3/convert_nnet2_to_nnet3.py | 486 + evaluation/utility/asr/steps/nnet3/decode.sh | 173 + .../utility/asr/steps/nnet3/decode_grammar.sh | 139 + .../asr/steps/nnet3/decode_lookahead.sh | 148 + .../utility/asr/steps/nnet3/decode_looped.sh | 150 + .../asr/steps/nnet3/decode_score_fusion.sh | 276 + .../utility/asr/steps/nnet3/decode_semisup.sh | 165 + .../utility/asr/steps/nnet3/dot/__init__.py | 0 .../asr/steps/nnet3/dot/descriptor_parser.py | 88 + .../asr/steps/nnet3/dot/nnet3_to_dot.py | 469 + .../utility/asr/steps/nnet3/get_degs.sh | 476 + evaluation/utility/asr/steps/nnet3/get_egs.sh | 450 + .../asr/steps/nnet3/get_egs_discriminative.sh | 420 + .../asr/steps/nnet3/get_egs_targets.sh | 445 + .../utility/asr/steps/nnet3/get_saturation.pl | 106 + .../asr/steps/nnet3/get_successful_models.py | 64 + .../utility/asr/steps/nnet3/lstm/__init__.py | 0 .../asr/steps/nnet3/lstm/make_configs.py | 376 + .../utility/asr/steps/nnet3/lstm/train.sh | 744 + .../steps/nnet3/make_bottleneck_features.sh | 145 + .../utility/asr/steps/nnet3/make_denlats.sh | 208 + .../asr/steps/nnet3/make_tdnn_configs.py | 239 + .../asr/steps/nnet3/multilingual/__init__.py | 0 .../allocate_multilingual_examples.py | 232 + .../steps/nnet3/multilingual/combine_egs.sh | 163 + .../utility/asr/steps/nnet3/nnet3_to_dot.sh | 41 + .../utility/asr/steps/nnet3/remove_egs.sh | 46 + .../asr/steps/nnet3/report/__init__.py | 0 
.../asr/steps/nnet3/report/convert_model.py | 498 + .../asr/steps/nnet3/report/generate_plots.py | 823 + .../report/summarize_compute_debug_timing.py | 112 + .../utility/asr/steps/nnet3/tdnn/__init__.py | 0 .../asr/steps/nnet3/tdnn/make_configs.py | 567 + .../utility/asr/steps/nnet3/tdnn/train.sh | 671 + .../asr/steps/nnet3/tdnn/train_raw_nnet.sh | 556 + .../asr/steps/nnet3/train_discriminative.sh | 373 + .../utility/asr/steps/nnet3/train_dnn.py | 474 + .../utility/asr/steps/nnet3/train_raw_dnn.py | 516 + .../utility/asr/steps/nnet3/train_raw_rnn.py | 575 + .../utility/asr/steps/nnet3/train_rnn.py | 563 + .../utility/asr/steps/nnet3/train_tdnn.sh | 665 + .../asr/steps/nnet3/xconfig_to_config.py | 106 + .../asr/steps/nnet3/xconfig_to_configs.py | 342 + evaluation/utility/asr/steps/online/decode.sh | 104 + .../utility/asr/steps/online/nnet2/align.sh | 82 + .../asr/steps/online/nnet2/copy_data_dir.sh | 89 + .../steps/online/nnet2/copy_ivector_dir.sh | 48 + .../utility/asr/steps/online/nnet2/decode.sh | 144 + .../online/nnet2/dump_nnet_activations.sh | 132 + .../steps/online/nnet2/extract_ivectors.sh | 294 + .../online/nnet2/extract_ivectors_online.sh | 147 + .../utility/asr/steps/online/nnet2/get_egs.sh | 285 + .../asr/steps/online/nnet2/get_egs2.sh | 288 + .../online/nnet2/get_egs_discriminative2.sh | 246 + .../steps/online/nnet2/get_pca_transform.sh | 67 + .../asr/steps/online/nnet2/make_denlats.sh | 167 + .../online/nnet2/prepare_online_decoding.sh | 167 + .../nnet2/prepare_online_decoding_retrain.sh | 118 + .../nnet2/prepare_online_decoding_transfer.sh | 79 + .../asr/steps/online/nnet2/train_diag_ubm.sh | 170 + .../online/nnet2/train_ivector_extractor.sh | 211 + .../utility/asr/steps/online/nnet3/decode.sh | 151 + .../steps/online/nnet3/decode_wake_word.sh | 141 + .../online/nnet3/prepare_online_decoding.sh | 212 + .../steps/online/prepare_online_decoding.sh | 256 + evaluation/utility/asr/steps/oracle_wer.sh | 118 + .../utility/asr/steps/overlap/__init__.py | 0 .../steps/overlap/detect_overlaps_pyannote.py | 40 + .../steps/overlap/detect_overlaps_pyannote.sh | 65 + .../asr/steps/overlap/get_overlap_segments.py | 140 + .../asr/steps/overlap/get_overlap_targets.py | 157 + .../asr/steps/overlap/output_to_rttm.py | 340 + .../asr/steps/overlap/post_process_output.sh | 70 + .../steps/overlap/prepare_overlap_graph.py | 207 + evaluation/utility/asr/steps/paste_feats.sh | 85 + .../utility/asr/steps/pytorchnn/__init__.py | 0 .../utility/asr/steps/pytorchnn/check_py.py | 2 + .../pytorchnn/compute_sentence_scores.py | 304 + .../utility/asr/steps/pytorchnn/data.py | 53 + .../pytorchnn/estimate_arc_nnlm_scores.py | 125 + .../pytorchnn/lmrescore_lattice_pytorchnn.sh | 155 + .../pytorchnn/lmrescore_nbest_pytorchnn.sh | 231 + .../utility/asr/steps/pytorchnn/model.py | 157 + .../utility/asr/steps/pytorchnn/train.py | 261 + .../utility/asr/steps/resegment_data.sh | 130 + .../utility/asr/steps/resegment_text.sh | 125 + evaluation/utility/asr/steps/rnnlmrescore.sh | 217 + evaluation/utility/asr/steps/score_kaldi.sh | 149 + .../utility/asr/steps/score_kaldi_compare.sh | 50 + .../asr/steps/scoring/score_kaldi_cer.sh | 200 + .../asr/steps/scoring/score_kaldi_compare.sh | 50 + .../asr/steps/scoring/score_kaldi_wer.sh | 149 + evaluation/utility/asr/steps/search_index.sh | 59 + .../asr/steps/segmentation/__init__.py | 0 .../asr/steps/segmentation/ali_to_targets.sh | 118 + .../segmentation/combine_targets_dirs.sh | 55 + .../convert_targets_dir_to_whole_recording.sh | 123 + .../convert_utt2spk_and_segments_to_rttm.py 
| 101 + .../steps/segmentation/copy_targets_dir.sh | 46 + .../asr/steps/segmentation/decode_sad.sh | 61 + .../segmentation/detect_speech_activity.sh | 251 + .../segmentation/evaluate_segmentation.pl | 198 + .../get_targets_for_out_of_segments.sh | 99 + .../steps/segmentation/internal/__init__.py | 0 .../internal/arc_info_to_targets.py | 161 + .../segmentation/internal/find_oov_phone.py | 57 + ...get_default_targets_for_out_of_segments.py | 180 + .../internal/get_transform_probs_mat.py | 79 + .../merge_segment_targets_to_recording.py | 291 + .../segmentation/internal/merge_targets.py | 214 + .../internal/prepare_sad_graph.py | 164 + .../segmentation/internal/resample_targets.py | 122 + .../segmentation/internal/sad_to_segments.py | 333 + .../internal/verify_phones_list.py | 41 + .../asr/steps/segmentation/lats_to_targets.sh | 137 + .../steps/segmentation/merge_targets_dirs.sh | 107 + .../post_process_sad_to_segments.sh | 66 + .../steps/segmentation/prepare_targets_gmm.sh | 317 + .../segmentation/resample_targets_dir.sh | 101 + .../segmentation/validate_targets_dir.sh | 87 + evaluation/utility/asr/steps/select_feats.sh | 89 + evaluation/utility/asr/steps/shift_feats.sh | 90 + .../utility/asr/steps/subset_ali_dir.sh | 67 + .../utility/asr/steps/tandem/align_fmllr.sh | 188 + .../utility/asr/steps/tandem/align_sgmm2.sh | 235 + .../utility/asr/steps/tandem/align_si.sh | 132 + evaluation/utility/asr/steps/tandem/decode.sh | 147 + .../utility/asr/steps/tandem/decode_fmllr.sh | 246 + .../utility/asr/steps/tandem/decode_sgmm2.sh | 240 + .../utility/asr/steps/tandem/decode_si.sh | 147 + .../utility/asr/steps/tandem/make_denlats.sh | 208 + .../asr/steps/tandem/make_denlats_sgmm2.sh | 199 + .../asr/steps/tandem/mk_aslf_lda_mllt.sh | 177 + .../utility/asr/steps/tandem/mk_aslf_sgmm2.sh | 178 + .../utility/asr/steps/tandem/train_deltas.sh | 166 + .../asr/steps/tandem/train_lda_mllt.sh | 260 + .../utility/asr/steps/tandem/train_mllt.sh | 239 + .../utility/asr/steps/tandem/train_mmi.sh | 187 + .../asr/steps/tandem/train_mmi_sgmm2.sh | 193 + .../utility/asr/steps/tandem/train_mono.sh | 162 + .../utility/asr/steps/tandem/train_sat.sh | 281 + .../utility/asr/steps/tandem/train_sgmm2.sh | 337 + .../utility/asr/steps/tandem/train_ubm.sh | 171 + .../utility/asr/steps/tfrnnlm/__init__.py | 0 .../utility/asr/steps/tfrnnlm/check_py.py | 2 + .../tfrnnlm/check_tensorflow_installed.sh | 26 + .../asr/steps/tfrnnlm/lmrescore_rnnlm_lat.sh | 101 + .../tfrnnlm/lmrescore_rnnlm_lat_pruned.sh | 94 + evaluation/utility/asr/steps/tfrnnlm/lstm.py | 240 + .../utility/asr/steps/tfrnnlm/lstm_fast.py | 162 + .../utility/asr/steps/tfrnnlm/reader.py | 121 + .../asr/steps/tfrnnlm/vanilla_rnnlm.py | 324 + evaluation/utility/asr/steps/train_deltas.sh | 173 + .../utility/asr/steps/train_diag_ubm.sh | 130 + .../utility/asr/steps/train_lda_mllt.sh | 238 + evaluation/utility/asr/steps/train_lvtln.sh | 356 + evaluation/utility/asr/steps/train_map.sh | 119 + evaluation/utility/asr/steps/train_mmi.sh | 152 + .../utility/asr/steps/train_mmi_fmmi.sh | 230 + .../asr/steps/train_mmi_fmmi_indirect.sh | 254 + .../utility/asr/steps/train_mmi_sgmm2.sh | 159 + evaluation/utility/asr/steps/train_mono.sh | 156 + evaluation/utility/asr/steps/train_mpe.sh | 163 + evaluation/utility/asr/steps/train_nnet.sh | 475 + evaluation/utility/asr/steps/train_quick.sh | 202 + evaluation/utility/asr/steps/train_raw_sat.sh | 297 + evaluation/utility/asr/steps/train_sat.sh | 278 + .../utility/asr/steps/train_sat_basis.sh | 275 + .../utility/asr/steps/train_segmenter.sh | 
147 + evaluation/utility/asr/steps/train_sgmm2.sh | 300 + .../utility/asr/steps/train_sgmm2_group.sh | 354 + evaluation/utility/asr/steps/train_smbr.sh | 163 + evaluation/utility/asr/steps/train_ubm.sh | 143 + .../utility/asr/steps/word_align_lattices.sh | 48 + evaluation/utility/asr/utils/__init__.py | 0 evaluation/utility/asr/utils/add_disambig.pl | 58 + .../utility/asr/utils/add_lex_disambig.pl | 196 + .../utility/asr/utils/analyze_segments.pl | 43 + evaluation/utility/asr/utils/apply_map.pl | 97 + evaluation/utility/asr/utils/best_wer.sh | 32 + .../utility/asr/utils/build_const_arpa_lm.sh | 53 + .../asr/utils/build_kenlm_model_from_arpa.sh | 44 + evaluation/utility/asr/utils/combine_data.sh | 146 + evaluation/utility/asr/utils/convert_ctm.pl | 96 + evaluation/utility/asr/utils/convert_slf.pl | 302 + .../utility/asr/utils/convert_slf_parallel.sh | 71 + evaluation/utility/asr/utils/copy_data_dir.sh | 149 + .../utility/asr/utils/create_data_link.pl | 132 + .../utility/asr/utils/create_split_dir.pl | 92 + evaluation/utility/asr/utils/ctm/__init__.py | 0 .../utility/asr/utils/ctm/convert_ctm.pl | 96 + evaluation/utility/asr/utils/ctm/fix_ctm.sh | 32 + .../asr/utils/ctm/resolve_ctm_overlaps.py | 324 + evaluation/utility/asr/utils/data/__init__.py | 0 .../utility/asr/utils/data/combine_data.sh | 146 + .../asr/utils/data/combine_short_segments.sh | 187 + .../utils/data/convert_data_dir_to_whole.sh | 62 + .../utility/asr/utils/data/copy_data_dir.sh | 149 + .../asr/utils/data/extend_segment_times.py | 118 + .../data/extract_wav_segments_data_dir.sh | 59 + .../utility/asr/utils/data/fix_data_dir.sh | 215 + .../asr/utils/data/fix_subsegment_feats.pl | 106 + .../asr/utils/data/get_allowed_durations.py | 219 + .../utility/asr/utils/data/get_frame_shift.sh | 74 + .../utility/asr/utils/data/get_num_frames.sh | 25 + .../utility/asr/utils/data/get_reco2dur.sh | 143 + .../asr/utils/data/get_reco2utt_for_data.sh | 21 + .../asr/utils/data/get_segments_for_data.sh | 29 + .../asr/utils/data/get_uniform_subsegments.py | 121 + .../utility/asr/utils/data/get_utt2dur.sh | 135 + .../asr/utils/data/get_utt2num_frames.sh | 46 + .../asr/utils/data/internal/__init__.py | 0 .../data/internal/choose_utts_to_combine.py | 394 + .../internal/combine_segments_to_recording.py | 66 + .../data/internal/modify_speaker_info.py | 115 + .../asr/utils/data/internal/perturb_volume.py | 115 + .../asr/utils/data/limit_feature_dim.sh | 48 + .../asr/utils/data/modify_speaker_info.sh | 125 + .../data/modify_speaker_info_to_recording.sh | 50 + .../asr/utils/data/normalize_data_range.pl | 128 + .../asr/utils/data/perturb_data_dir_speed.sh | 149 + .../utils/data/perturb_data_dir_speed_3way.sh | 89 + .../asr/utils/data/perturb_data_dir_volume.sh | 93 + .../data/perturb_speed_to_allowed_lengths.py | 336 + .../utility/asr/utils/data/remove_dup_utts.sh | 61 + .../asr/utils/data/resample_data_dir.sh | 44 + .../asr/utils/data/shift_and_combine_feats.sh | 67 + .../utility/asr/utils/data/shift_feats.sh | 55 + .../utility/asr/utils/data/split_data.sh | 160 + .../asr/utils/data/subsegment_data_dir.sh | 249 + .../utility/asr/utils/data/subset_data_dir.sh | 196 + .../asr/utils/data/validate_data_dir.sh | 404 + .../asr/utils/dict_dir_add_pronprobs.sh | 252 + evaluation/utility/asr/utils/eps2disambig.pl | 29 + evaluation/utility/asr/utils/filt.py | 15 + evaluation/utility/asr/utils/filter_scp.pl | 87 + evaluation/utility/asr/utils/filter_scps.pl | 170 + .../utility/asr/utils/find_arpa_oovs.pl | 71 + evaluation/utility/asr/utils/fix_ctm.sh | 32 + 
evaluation/utility/asr/utils/fix_data_dir.sh | 215 + evaluation/utility/asr/utils/format_lm.sh | 85 + evaluation/utility/asr/utils/format_lm_sri.sh | 91 + evaluation/utility/asr/utils/gen_topo.pl | 79 + .../asr/utils/generate_selected_speakers.py | 22 + evaluation/utility/asr/utils/int2sym.pl | 71 + .../utility/asr/utils/kwslist_post_process.pl | 291 + evaluation/utility/asr/utils/lang/__init__.py | 0 .../asr/utils/lang/add_lex_disambig.pl | 196 + .../asr/utils/lang/add_unigrams_arpa.pl | 93 + .../utility/asr/utils/lang/adjust_unk_arpa.pl | 68 + .../asr/utils/lang/adjust_unk_graph.sh | 38 + .../utility/asr/utils/lang/bpe/__init__.py | 0 .../lang/bpe/add_final_optional_silence.sh | 57 + .../utility/asr/utils/lang/bpe/apply_bpe.py | 340 + evaluation/utility/asr/utils/lang/bpe/bidi.py | 58 + .../utility/asr/utils/lang/bpe/learn_bpe.py | 258 + .../asr/utils/lang/bpe/prepend_words.py | 16 + .../utility/asr/utils/lang/bpe/reverse.py | 13 + .../asr/utils/lang/check_g_properties.pl | 90 + .../asr/utils/lang/check_phones_compatible.sh | 59 + .../utils/lang/compute_sentence_probs_arpa.py | 155 + .../utility/asr/utils/lang/extend_lang.sh | 264 + .../utils/lang/get_word_position_phone_map.pl | 154 + .../asr/utils/lang/grammar/__init__.py | 0 .../utils/lang/grammar/augment_phones_txt.py | 96 + .../utils/lang/grammar/augment_words_txt.py | 96 + .../asr/utils/lang/internal/__init__.py | 0 .../asr/utils/lang/internal/apply_unk_lm.sh | 91 + .../lang/internal/arpa2fst_constrained.py | 395 + .../utils/lang/internal/modify_unk_pron.py | 108 + .../asr/utils/lang/limit_arpa_unk_history.py | 161 + .../utility/asr/utils/lang/make_kn_lm.py | 379 + .../asr/utils/lang/make_lexicon_fst.py | 411 + .../utils/lang/make_lexicon_fst_silprob.py | 408 + .../asr/utils/lang/make_phone_bigram_lang.sh | 123 + .../utility/asr/utils/lang/make_phone_lm.py | 886 + ...make_position_dependent_subword_lexicon.py | 107 + .../utils/lang/make_subword_lexicon_fst.py | 301 + .../utility/asr/utils/lang/make_unk_lm.sh | 315 + .../asr/utils/lang/ngram_entropy_pruning.py | 627 + .../utility/asr/utils/lang/prepare_lang.sh | 567 + .../utils/lang/validate_disambig_sym_file.pl | 81 + .../utility/asr/utils/lang/validate_lang.pl | 1076 + evaluation/utility/asr/utils/ln.pl | 58 + evaluation/utility/asr/utils/make_absolute.sh | 21 + .../utility/asr/utils/make_lexicon_fst.pl | 160 + .../asr/utils/make_lexicon_fst_silprob.pl | 151 + .../utility/asr/utils/make_unigram_grammar.pl | 54 + evaluation/utility/asr/utils/map_arpa_lm.pl | 137 + evaluation/utility/asr/utils/mkgraph.sh | 186 + .../utility/asr/utils/mkgraph_lookahead.sh | 208 + .../asr/utils/nnet-cpu/make_nnet_config.pl | 160 + .../utils/nnet-cpu/make_nnet_config_block.pl | 157 + .../make_nnet_config_preconditioned.pl | 278 + .../utils/nnet-cpu/update_learning_rates.pl | 142 + evaluation/utility/asr/utils/nnet/__init__.py | 0 .../utility/asr/utils/nnet/gen_dct_mat.py | 72 + .../utility/asr/utils/nnet/gen_hamm_mat.py | 62 + .../utility/asr/utils/nnet/gen_splice.py | 55 + .../asr/utils/nnet/make_blstm_proto.py | 95 + .../utility/asr/utils/nnet/make_cnn_proto.py | 165 + .../utility/asr/utils/nnet/make_lstm_proto.py | 87 + .../utility/asr/utils/nnet/make_nnet_proto.py | 235 + .../asr/utils/nnet/subset_data_tr_cv.sh | 64 + .../utility/asr/utils/nnet3/__init__.py | 0 .../nnet3/convert_config_tdnn_to_affine.py | 99 + .../asr/utils/parallel/limit_num_gpus.sh | 58 + evaluation/utility/asr/utils/parallel/pbs.pl | 585 + .../utility/asr/utils/parallel/queue.pl | 624 + .../utility/asr/utils/parallel/retry.pl | 
105 + evaluation/utility/asr/utils/parallel/run.pl | 356 + .../utility/asr/utils/parallel/slurm.pl | 638 + evaluation/utility/asr/utils/parse_options.sh | 97 + evaluation/utility/asr/utils/pbs.pl | 585 + .../asr/utils/perturb_data_dir_speed.sh | 149 + evaluation/utility/asr/utils/pinyin_map.pl | 79 + .../asr/utils/prepare_extended_lang.sh | 166 + evaluation/utility/asr/utils/prepare_lang.sh | 567 + .../utils/prepare_online_nnet_dist_build.sh | 75 + evaluation/utility/asr/utils/queue.pl | 624 + .../utility/asr/utils/remove_data_links.sh | 53 + evaluation/utility/asr/utils/remove_oovs.pl | 43 + .../utility/asr/utils/require_argument.sh | 22 + .../utility/asr/utils/require_argument_all.sh | 8 + evaluation/utility/asr/utils/retry.pl | 105 + evaluation/utility/asr/utils/reverse_arpa.py | 188 + .../utility/asr/utils/rnnlm_compute_scores.sh | 90 + evaluation/utility/asr/utils/run.pl | 356 + evaluation/utility/asr/utils/s2eps.pl | 27 + .../asr/utils/scoring/wer_ops_details.pl | 223 + .../asr/utils/scoring/wer_per_spk_details.pl | 258 + .../asr/utils/scoring/wer_per_utt_details.pl | 222 + .../utility/asr/utils/scoring/wer_report.pl | 55 + evaluation/utility/asr/utils/segmentation.pl | 403 + evaluation/utility/asr/utils/show_lattice.sh | 43 + evaluation/utility/asr/utils/shuffle_list.pl | 44 + evaluation/utility/asr/utils/slurm.pl | 638 + .../utility/asr/utils/spk2utt_to_utt2spk.pl | 27 + evaluation/utility/asr/utils/split_data.sh | 160 + evaluation/utility/asr/utils/split_scp.pl | 246 + evaluation/utility/asr/utils/ssh.pl | 219 + .../utility/asr/utils/subset_data_dir.sh | 196 + .../asr/utils/subset_data_dir_tr_cv.sh | 64 + evaluation/utility/asr/utils/subset_scp.pl | 105 + .../asr/utils/subword/prepare_lang_subword.sh | 448 + .../asr/utils/subword/prepare_subword_text.sh | 48 + .../utility/asr/utils/summarize_logs.pl | 121 + .../utility/asr/utils/summarize_warnings.pl | 46 + evaluation/utility/asr/utils/sym2int.pl | 104 + .../asr/utils/train_arpa_with_kenlm.sh | 67 + .../utility/asr/utils/utt2spk_to_spk2utt.pl | 38 + .../utility/asr/utils/validate_data_dir.sh | 404 + .../utility/asr/utils/validate_dict_dir.pl | 531 + evaluation/utility/asr/utils/validate_lang.pl | 1076 + evaluation/utility/asr/utils/validate_text.pl | 136 + evaluation/utility/asr/utils/write_kwslist.pl | 346 + .../utility/voice_distinctiveness/__init__.py | 0 .../utility/voice_distinctiveness/deid_gvd.py | 157 + figures/framework.png | Bin 0 -> 110105 bytes requirements.txt | 12 + run_anonymization.py | 33 + run_evaluation.py | 367 + utils/__init__.py | 3 + utils/convert_results.py | 200 + utils/data_io.py | 95 + utils/path_management.py | 54 + utils/prepare_results_in_kaldi_format.py | 132 + 910 files changed, 276052 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 anonymization/README.md create mode 100644 anonymization/__init__.py create mode 100644 anonymization/modules/__init__.py create mode 100644 anonymization/modules/prosody/__init__.py create mode 100644 anonymization/modules/prosody/anonymization/__init__.py create mode 100644 anonymization/modules/prosody/anonymization/ims_prosody_anonymization.py create mode 100644 anonymization/modules/prosody/extraction/__init__.py create mode 100644 anonymization/modules/prosody/extraction/ims_prosody_extraction.py create mode 100644 anonymization/modules/prosody/prosody.py create mode 100644 anonymization/modules/prosody/prosody_anonymization.py create mode 100644 anonymization/modules/prosody/prosody_extraction.py create mode 100644 
anonymization/modules/speaker_embeddings/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/base_anon.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/gan_anon.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/pool_anon.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/random_anon.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/embeddings_generator.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/init_wgan.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/resnet_1.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/wgan_qc.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py create mode 100644 anonymization/modules/speaker_embeddings/extraction/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/extraction/embedding_methods/__init__.py create mode 100644 anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py create mode 100644 anonymization/modules/speaker_embeddings/extraction/embedding_methods/style_embeddings.py create mode 100644 anonymization/modules/speaker_embeddings/extraction/ims_speaker_extraction_methods.py create mode 100644 anonymization/modules/speaker_embeddings/speaker_anonymization.py create mode 100644 anonymization/modules/speaker_embeddings/speaker_embeddings.py create mode 100644 anonymization/modules/speaker_embeddings/speaker_extraction.py create mode 100644 anonymization/modules/text/__init__.py create mode 100644 anonymization/modules/text/recognition/__init__.py create mode 100644 anonymization/modules/text/recognition/ims_asr.py create mode 100644 anonymization/modules/text/speech_recognition.py create mode 100644 anonymization/modules/text/text.py create mode 100644 anonymization/modules/tts/IMSToucan/.gitignore create mode 100644 anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py create mode 100644 anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py create mode 100644 anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py create mode 100644 anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/InferenceInterfaces/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/LICENSE create mode 100644 anonymization/modules/tts/IMSToucan/Layers/Attention.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/Conformer.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/Convolution.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/DurationPredictor.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/EncoderLayer.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/LayerNorm.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/LengthRegulator.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/MultiLayeredConv1d.py create mode 
100644 anonymization/modules/tts/IMSToucan/Layers/MultiSequential.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/PositionalEncoding.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/PositionwiseFeedForward.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/PostNet.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/RNNAttention.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/ResidualBlock.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/ResidualStack.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/STFT.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/Swish.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/VariancePredictor.py create mode 100644 anonymization/modules/tts/IMSToucan/Layers/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/Preprocessing/AudioPreprocessor.py create mode 100644 anonymization/modules/tts/IMSToucan/Preprocessing/TextFrontend.py create mode 100644 anonymization/modules/tts/IMSToucan/Preprocessing/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/Preprocessing/articulatory_features.py create mode 100644 anonymization/modules/tts/IMSToucan/README.md create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Spectrogram_to_Embedding/GST.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Spectrogram_to_Embedding/StyleEmbedding.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Spectrogram_to_Embedding/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/Aligner.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/DurationCalculator.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/EnergyCalculator.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/PitchCalculator.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/Text_to_Spectrogram/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/TrainingInterfaces/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/Utility/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/Utility/utils.py create mode 100644 anonymization/modules/tts/IMSToucan/UtteranceCloner.py create mode 100644 anonymization/modules/tts/IMSToucan/__init__.py create mode 100644 anonymization/modules/tts/IMSToucan/requirements.txt create mode 100644 anonymization/modules/tts/__init__.py create mode 100644 anonymization/modules/tts/ims_tts.py create mode 100644 anonymization/modules/tts/speech_synthesis.py create mode 100644 anonymization/pipelines/__init__.py create mode 100644 anonymization/pipelines/sttts_pipeline.py create mode 100644 configs/anon_ims_sttts_pc.yaml create mode 100644 configs/eval_gvd_both.yaml create mode 100644 configs/eval_post_ecapa_cos_ft.yaml create mode 100644 configs/eval_post_ecapa_cos_scratch.yaml create mode 100644 configs/eval_post_xvector_plda_scratch.yaml create mode 100644 configs/eval_pre_ecapa_cos.yaml create mode 100644 configs/eval_pre_ecapa_plda.yaml 
create mode 100644 configs/eval_pre_xvector_cos.yaml create mode 100644 configs/eval_pre_xvector_plda.yaml create mode 100644 evaluation/README.md create mode 100644 evaluation/__init__.py create mode 100644 evaluation/privacy/__init__.py create mode 100644 evaluation/privacy/asv/__init__.py create mode 100644 evaluation/privacy/asv/asv.py create mode 100644 evaluation/privacy/asv/asv_train/__init__.py create mode 100644 evaluation/privacy/asv/asv_train/asv_dataset.py create mode 100755 evaluation/privacy/asv/asv_train/hparams/ecapa/hyperparams.yaml create mode 100755 evaluation/privacy/asv/asv_train/hparams/hyperparams.yaml create mode 100644 evaluation/privacy/asv/asv_train/hparams/train_ecapa_tdnn_small.yaml create mode 100644 evaluation/privacy/asv/asv_train/hparams/train_ecapa_tdnn_small_ft.yaml create mode 100644 evaluation/privacy/asv/asv_train/hparams/train_x_vectors.yaml create mode 100644 evaluation/privacy/asv/asv_train/hparams/train_x_vectors_ft.yaml create mode 100755 evaluation/privacy/asv/asv_train/hparams/verification_ecapa.yaml create mode 100755 evaluation/privacy/asv/asv_train/hparams/verification_plda_xvector.yaml create mode 100644 evaluation/privacy/asv/asv_train/hparams/xvector/hyperparams.yaml create mode 100644 evaluation/privacy/asv/asv_train/libri_prepare.py create mode 100644 evaluation/privacy/asv/asv_train/speechbrain_defaults.py create mode 100644 evaluation/privacy/asv/asv_train/train_speaker_embeddings.py create mode 100644 evaluation/privacy/asv/metrics/__init__.py create mode 100644 evaluation/privacy/asv/metrics/cllr.py create mode 100644 evaluation/privacy/asv/metrics/helpers.py create mode 100644 evaluation/privacy/asv/metrics/linkability.py create mode 100644 evaluation/privacy/asv/metrics/utils/__init__.py create mode 100644 evaluation/privacy/asv/metrics/utils/io.py create mode 100644 evaluation/privacy/asv/metrics/utils/plo_plots.py create mode 100644 evaluation/privacy/asv/metrics/utils/visualization.py create mode 100644 evaluation/privacy/asv/metrics/utils/zebra_plots.py create mode 100644 evaluation/privacy/asv/metrics/zebra.py create mode 100755 evaluation/utility/__init__.py create mode 100644 evaluation/utility/asr/__init__.py create mode 100755 evaluation/utility/asr/asr.sh create mode 100755 evaluation/utility/asr/asr_old.sh create mode 100755 evaluation/utility/asr/cmd.sh create mode 100755 evaluation/utility/asr/conf/decode_asr.yaml create mode 100755 evaluation/utility/asr/conf/decode_asr_anon.yaml create mode 100755 evaluation/utility/asr/conf/decode_asr_rnnt.yaml create mode 100755 evaluation/utility/asr/conf/decode_asr_transformer_with_k2.yaml create mode 100755 evaluation/utility/asr/conf/fbank.conf create mode 100755 evaluation/utility/asr/conf/pbs.conf create mode 100755 evaluation/utility/asr/conf/pitch.conf create mode 100755 evaluation/utility/asr/conf/queue.conf create mode 100755 evaluation/utility/asr/conf/slurm.conf create mode 100755 evaluation/utility/asr/conf/train_asr_branchformer.yaml create mode 100755 evaluation/utility/asr/conf/train_asr_conformer.yaml create mode 100755 evaluation/utility/asr/conf/train_asr_rnnt.yaml create mode 100755 evaluation/utility/asr/conf/train_asr_transformer.yaml create mode 100755 evaluation/utility/asr/conf/train_asr_transformer_anon.yaml create mode 100755 evaluation/utility/asr/conf/train_lm_transformer.yaml create mode 100755 evaluation/utility/asr/conf/train_rnn_lm.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_branchformer_hop_length160_e18_linear3072.yaml 
create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer10_hop_length160.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer4.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer5.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer6_n_fft400_hop_length160.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer6_n_fft512_hop_length128.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer6_n_fft512_hop_length256.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer7_hubert_ll60k_large.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer7_n_fft512_hop_length256.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer7_wav2vec2_960hr_large.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer7_wavlm_large.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer8.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_e_branchformer.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_transformer.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_transformer3.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_asr_transformer3_w2v_large_lv60_960h_finetuning_last_1layer.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_lm_adam.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_lm_transformer2.yaml create mode 100755 evaluation/utility/asr/conf/tuning/train_lm_transformer2_anon.yaml create mode 100755 evaluation/utility/asr/conf/tuning/transducer/decode.yaml create mode 100755 evaluation/utility/asr/conf/tuning/transducer/train_conformer-rnn_transducer.yaml create mode 100644 evaluation/utility/asr/data/en_token_list/bpe_unigram5000/bpe.model create mode 100644 evaluation/utility/asr/data/en_token_list/bpe_unigram5000/bpe.vocab create mode 100644 evaluation/utility/asr/data/en_token_list/bpe_unigram5000/tokens.txt create mode 100644 evaluation/utility/asr/data/en_token_list/bpe_unigram5000/train.txt create mode 100755 evaluation/utility/asr/db.sh create mode 100644 evaluation/utility/asr/local/__init__.py create mode 100755 evaluation/utility/asr/local/data.sh create mode 100755 evaluation/utility/asr/local/data_prep.sh create mode 100755 evaluation/utility/asr/local/data_prep_anon.py create mode 100755 evaluation/utility/asr/local/download_and_untar.sh create mode 100755 evaluation/utility/asr/local/path.sh create mode 100755 evaluation/utility/asr/path.sh create mode 100644 evaluation/utility/asr/pyscripts/__init__.py create mode 100644 evaluation/utility/asr/pyscripts/audio/__init__.py create mode 100755 evaluation/utility/asr/pyscripts/audio/format_wav_scp.py create mode 100755 evaluation/utility/asr/pyscripts/audio/trim_silence.py create mode 100644 evaluation/utility/asr/pyscripts/feats/__init__.py create mode 100755 evaluation/utility/asr/pyscripts/feats/feat-to-shape.py create mode 100644 evaluation/utility/asr/pyscripts/utils/__init__.py create mode 100755 evaluation/utility/asr/pyscripts/utils/convert_text_to_phn.py create mode 100755 evaluation/utility/asr/pyscripts/utils/evaluate_f0.py create mode 100755 evaluation/utility/asr/pyscripts/utils/evaluate_mcd.py 
create mode 100755 evaluation/utility/asr/pyscripts/utils/extract_xvectors.py create mode 100755 evaluation/utility/asr/pyscripts/utils/get_yaml.py create mode 100755 evaluation/utility/asr/pyscripts/utils/make_token_list_from_config.py create mode 100755 evaluation/utility/asr/pyscripts/utils/plot_sinc_filters.py create mode 100755 evaluation/utility/asr/pyscripts/utils/print_args.py create mode 100755 evaluation/utility/asr/pyscripts/utils/remove_duplicate_keys.py create mode 100755 evaluation/utility/asr/pyscripts/utils/rotate_logfile.py create mode 100755 evaluation/utility/asr/pyscripts/utils/score_intent.py create mode 100755 evaluation/utility/asr/pyscripts/utils/score_lang_id.py create mode 100755 evaluation/utility/asr/pyscripts/utils/score_summarization.py create mode 100755 evaluation/utility/asr/pyscripts/utils/utt2spk_to_utt2sid.py create mode 100755 evaluation/utility/asr/run.sh create mode 100644 evaluation/utility/asr/scripts/__init__.py create mode 100755 evaluation/utility/asr/scripts/audio/format_wav_scp.sh create mode 100755 evaluation/utility/asr/scripts/audio/trim_silence.sh create mode 100755 evaluation/utility/asr/scripts/feats/feat_to_shape.sh create mode 100755 evaluation/utility/asr/scripts/feats/make_fbank.sh create mode 100755 evaluation/utility/asr/scripts/feats/make_stft.sh create mode 100755 evaluation/utility/asr/scripts/text/run_spm.sh create mode 100755 evaluation/utility/asr/scripts/utils/TEMPLATE_HF_Readme.md create mode 100755 evaluation/utility/asr/scripts/utils/TEMPLATE_Readme.md create mode 100644 evaluation/utility/asr/scripts/utils/__init__.py create mode 100755 evaluation/utility/asr/scripts/utils/create_README_file.py create mode 100755 evaluation/utility/asr/scripts/utils/download_from_google_drive.sh create mode 100755 evaluation/utility/asr/scripts/utils/eval_perm_free_error.py create mode 100755 evaluation/utility/asr/scripts/utils/evaluate_asr.sh create mode 100755 evaluation/utility/asr/scripts/utils/get_model_names.py create mode 100755 evaluation/utility/asr/scripts/utils/mfa.sh create mode 100755 evaluation/utility/asr/scripts/utils/perturb_data_dir_speed.sh create mode 100755 evaluation/utility/asr/scripts/utils/print_args.sh create mode 100755 evaluation/utility/asr/scripts/utils/remove_punctuation.pl create mode 100755 evaluation/utility/asr/scripts/utils/show_asr_result.sh create mode 100755 evaluation/utility/asr/scripts/utils/show_translation_result.sh create mode 100755 evaluation/utility/asr/scripts/utils/upload_models_to_hub.sh create mode 100644 evaluation/utility/asr/steps/__init__.py create mode 100755 evaluation/utility/asr/steps/align_basis_fmllr.sh create mode 100755 evaluation/utility/asr/steps/align_basis_fmllr_lats.sh create mode 100755 evaluation/utility/asr/steps/align_fmllr.sh create mode 100755 evaluation/utility/asr/steps/align_fmllr_lats.sh create mode 100755 evaluation/utility/asr/steps/align_lvtln.sh create mode 100755 evaluation/utility/asr/steps/align_mapped.sh create mode 100755 evaluation/utility/asr/steps/align_raw_fmllr.sh create mode 100755 evaluation/utility/asr/steps/align_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/align_si.sh create mode 100755 evaluation/utility/asr/steps/append_feats.sh create mode 100755 evaluation/utility/asr/steps/best_path_weights.sh create mode 100644 evaluation/utility/asr/steps/chain/__init__.py create mode 100755 evaluation/utility/asr/steps/chain/build_tree.sh create mode 100755 evaluation/utility/asr/steps/chain/build_tree_multiple_sources.sh create mode 
100755 evaluation/utility/asr/steps/chain/e2e/README.txt create mode 100644 evaluation/utility/asr/steps/chain/e2e/__init__.py create mode 100755 evaluation/utility/asr/steps/chain/e2e/compute_biphone_stats.py create mode 100755 evaluation/utility/asr/steps/chain/e2e/get_egs_e2e.sh create mode 100755 evaluation/utility/asr/steps/chain/e2e/prepare_e2e.sh create mode 100755 evaluation/utility/asr/steps/chain/e2e/text_to_phones.py create mode 100755 evaluation/utility/asr/steps/chain/e2e/train_e2e.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo.pl create mode 100755 evaluation/utility/asr/steps/chain/gen_topo.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo2.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo3.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo4.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo5.py create mode 100755 evaluation/utility/asr/steps/chain/gen_topo_orig.py create mode 100755 evaluation/utility/asr/steps/chain/get_egs.sh create mode 100755 evaluation/utility/asr/steps/chain/get_model_context.sh create mode 100755 evaluation/utility/asr/steps/chain/get_phone_post.sh create mode 100755 evaluation/utility/asr/steps/chain/make_weighted_den_fst.sh create mode 100755 evaluation/utility/asr/steps/chain/multilingual/combine_egs.sh create mode 100755 evaluation/utility/asr/steps/chain/train.py create mode 100755 evaluation/utility/asr/steps/chain/train_tdnn.sh create mode 100644 evaluation/utility/asr/steps/chain2/__init__.py create mode 100755 evaluation/utility/asr/steps/chain2/combine_egs.sh create mode 100755 evaluation/utility/asr/steps/chain2/compute_preconditioning_matrix.sh create mode 100755 evaluation/utility/asr/steps/chain2/get_raw_egs.sh create mode 100644 evaluation/utility/asr/steps/chain2/internal/__init__.py create mode 100755 evaluation/utility/asr/steps/chain2/internal/get_best_model.sh create mode 100755 evaluation/utility/asr/steps/chain2/internal/get_train_schedule.py create mode 100755 evaluation/utility/asr/steps/chain2/process_egs.sh create mode 100755 evaluation/utility/asr/steps/chain2/randomize_egs.sh create mode 100755 evaluation/utility/asr/steps/chain2/train.sh create mode 100755 evaluation/utility/asr/steps/chain2/validate_processed_egs.sh create mode 100755 evaluation/utility/asr/steps/chain2/validate_randomized_egs.sh create mode 100755 evaluation/utility/asr/steps/chain2/validate_raw_egs.sh create mode 100644 evaluation/utility/asr/steps/cleanup/__init__.py create mode 100755 evaluation/utility/asr/steps/cleanup/clean_and_segment_data.sh create mode 100755 evaluation/utility/asr/steps/cleanup/clean_and_segment_data_nnet3.sh create mode 100755 evaluation/utility/asr/steps/cleanup/combine_short_segments.py create mode 100755 evaluation/utility/asr/steps/cleanup/create_segments_from_ctm.pl create mode 100755 evaluation/utility/asr/steps/cleanup/debug_lexicon.sh create mode 100755 evaluation/utility/asr/steps/cleanup/decode_fmllr_segmentation.sh create mode 100755 evaluation/utility/asr/steps/cleanup/decode_segmentation.sh create mode 100755 evaluation/utility/asr/steps/cleanup/decode_segmentation_nnet3.sh create mode 100755 evaluation/utility/asr/steps/cleanup/find_bad_utts.sh create mode 100755 evaluation/utility/asr/steps/cleanup/find_bad_utts_nnet.sh create mode 100644 evaluation/utility/asr/steps/cleanup/internal/__init__.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/align_ctm_ref.py create mode 100755 
evaluation/utility/asr/steps/cleanup/internal/compute_tf_idf.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/ctm_to_text.pl create mode 100755 evaluation/utility/asr/steps/cleanup/internal/get_ctm_edits.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/get_non_scored_words.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/get_pron_stats.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/make_one_biased_lm.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/modify_ctm_edits.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/resolve_ctm_edits_overlaps.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/retrieve_similar_docs.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/segment_ctm_edits.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/segment_ctm_edits_mild.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/split_text_into_docs.pl create mode 100755 evaluation/utility/asr/steps/cleanup/internal/stitch_documents.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/taint_ctm_edits.py create mode 100755 evaluation/utility/asr/steps/cleanup/internal/tf_idf.py create mode 100755 evaluation/utility/asr/steps/cleanup/lattice_oracle_align.sh create mode 100755 evaluation/utility/asr/steps/cleanup/make_biased_lm_graphs.sh create mode 100755 evaluation/utility/asr/steps/cleanup/make_biased_lms.py create mode 100755 evaluation/utility/asr/steps/cleanup/make_segmentation_data_dir.sh create mode 100755 evaluation/utility/asr/steps/cleanup/make_segmentation_graph.sh create mode 100755 evaluation/utility/asr/steps/cleanup/make_utterance_fsts.pl create mode 100755 evaluation/utility/asr/steps/cleanup/make_utterance_graph.sh create mode 100755 evaluation/utility/asr/steps/cleanup/segment_long_utterances.sh create mode 100755 evaluation/utility/asr/steps/cleanup/segment_long_utterances_nnet3.sh create mode 100755 evaluation/utility/asr/steps/cleanup/split_long_utterance.sh create mode 100755 evaluation/utility/asr/steps/combine_ali_dirs.sh create mode 100755 evaluation/utility/asr/steps/combine_lat_dirs.sh create mode 100755 evaluation/utility/asr/steps/combine_trans_dirs.sh create mode 100755 evaluation/utility/asr/steps/compare_alignments.sh create mode 100755 evaluation/utility/asr/steps/compute_cmvn_stats.sh create mode 100755 evaluation/utility/asr/steps/compute_vad_decision.sh create mode 100644 evaluation/utility/asr/steps/conf/__init__.py create mode 100755 evaluation/utility/asr/steps/conf/append_eval_to_ctm.py create mode 100755 evaluation/utility/asr/steps/conf/append_prf_to_ctm.py create mode 100755 evaluation/utility/asr/steps/conf/apply_calibration.sh create mode 100755 evaluation/utility/asr/steps/conf/convert_ctm_to_tra.py create mode 100755 evaluation/utility/asr/steps/conf/get_ctm_conf.sh create mode 100755 evaluation/utility/asr/steps/conf/lattice_depth_per_frame.sh create mode 100755 evaluation/utility/asr/steps/conf/parse_arpa_unigrams.py create mode 100755 evaluation/utility/asr/steps/conf/prepare_calibration_data.py create mode 100755 evaluation/utility/asr/steps/conf/prepare_word_categories.py create mode 100755 evaluation/utility/asr/steps/conf/train_calibration.sh create mode 100755 evaluation/utility/asr/steps/copy_ali_dir.sh create mode 100755 evaluation/utility/asr/steps/copy_lat_dir.sh create mode 100755 evaluation/utility/asr/steps/copy_trans_dir.sh create mode 100644 
evaluation/utility/asr/steps/data/__init__.py create mode 100755 evaluation/utility/asr/steps/data/augment_data_dir.py create mode 100755 evaluation/utility/asr/steps/data/data_dir_manipulation_lib.py create mode 100755 evaluation/utility/asr/steps/data/make_musan.py create mode 100755 evaluation/utility/asr/steps/data/make_musan.sh create mode 100755 evaluation/utility/asr/steps/data/reverberate_data_dir.py create mode 100755 evaluation/utility/asr/steps/decode.sh create mode 100755 evaluation/utility/asr/steps/decode_basis_fmllr.sh create mode 100755 evaluation/utility/asr/steps/decode_biglm.sh create mode 100755 evaluation/utility/asr/steps/decode_combine.sh create mode 100755 evaluation/utility/asr/steps/decode_fmllr.sh create mode 100755 evaluation/utility/asr/steps/decode_fmllr_extra.sh create mode 100755 evaluation/utility/asr/steps/decode_fmmi.sh create mode 100755 evaluation/utility/asr/steps/decode_fromlats.sh create mode 100755 evaluation/utility/asr/steps/decode_lvtln.sh create mode 100755 evaluation/utility/asr/steps/decode_nnet.sh create mode 100755 evaluation/utility/asr/steps/decode_nolats.sh create mode 100755 evaluation/utility/asr/steps/decode_raw_fmllr.sh create mode 100755 evaluation/utility/asr/steps/decode_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/decode_sgmm2_fromlats.sh create mode 100755 evaluation/utility/asr/steps/decode_sgmm2_rescore.sh create mode 100755 evaluation/utility/asr/steps/decode_sgmm2_rescore_project.sh create mode 100755 evaluation/utility/asr/steps/decode_si.sh create mode 100755 evaluation/utility/asr/steps/decode_with_map.sh create mode 100644 evaluation/utility/asr/steps/diagnostic/__init__.py create mode 100755 evaluation/utility/asr/steps/diagnostic/analyze_alignments.sh create mode 100755 evaluation/utility/asr/steps/diagnostic/analyze_lats.sh create mode 100755 evaluation/utility/asr/steps/diagnostic/analyze_lattice_depth_stats.py create mode 100755 evaluation/utility/asr/steps/diagnostic/analyze_phone_length_stats.py create mode 100644 evaluation/utility/asr/steps/dict/__init__.py create mode 100755 evaluation/utility/asr/steps/dict/apply_g2p.sh create mode 100755 evaluation/utility/asr/steps/dict/apply_g2p_phonetisaurus.sh create mode 100755 evaluation/utility/asr/steps/dict/apply_lexicon_edits.py create mode 100755 evaluation/utility/asr/steps/dict/get_pron_stats.py create mode 100644 evaluation/utility/asr/steps/dict/internal/__init__.py create mode 100755 evaluation/utility/asr/steps/dict/internal/get_subsegments.py create mode 100755 evaluation/utility/asr/steps/dict/internal/prune_pron_candidates.py create mode 100755 evaluation/utility/asr/steps/dict/internal/sum_arc_info.py create mode 100755 evaluation/utility/asr/steps/dict/learn_lexicon_bayesian.sh create mode 100755 evaluation/utility/asr/steps/dict/learn_lexicon_greedy.sh create mode 100755 evaluation/utility/asr/steps/dict/merge_learned_lexicons.py create mode 100755 evaluation/utility/asr/steps/dict/prons_to_lexicon.py create mode 100755 evaluation/utility/asr/steps/dict/prune_pron_candidates.py create mode 100755 evaluation/utility/asr/steps/dict/select_prons_bayesian.py create mode 100755 evaluation/utility/asr/steps/dict/select_prons_greedy.py create mode 100755 evaluation/utility/asr/steps/dict/train_g2p.sh create mode 100755 evaluation/utility/asr/steps/dict/train_g2p_phonetisaurus.sh create mode 100755 evaluation/utility/asr/steps/get_ctm.sh create mode 100755 evaluation/utility/asr/steps/get_ctm_conf.sh create mode 100755 
evaluation/utility/asr/steps/get_ctm_conf_fast.sh create mode 100755 evaluation/utility/asr/steps/get_ctm_fast.sh create mode 100755 evaluation/utility/asr/steps/get_fmllr_basis.sh create mode 100755 evaluation/utility/asr/steps/get_lexicon_probs.sh create mode 100755 evaluation/utility/asr/steps/get_prons.sh create mode 100755 evaluation/utility/asr/steps/get_train_ctm.sh create mode 100755 evaluation/utility/asr/steps/info/chain_dir_info.pl create mode 100755 evaluation/utility/asr/steps/info/gmm_dir_info.pl create mode 100755 evaluation/utility/asr/steps/info/nnet2_dir_info.pl create mode 100755 evaluation/utility/asr/steps/info/nnet3_dir_info.pl create mode 100755 evaluation/utility/asr/steps/info/nnet3_disc_dir_info.pl create mode 100755 evaluation/utility/asr/steps/libs/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/common.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/report/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/report/log_parse.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/chain_objf/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/chain_objf/acoustic_model.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/common.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/dropout_schedule.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/frame_level_objf/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/frame_level_objf/common.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/train/frame_level_objf/raw_model.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/__init__.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/attention.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/basic_layers.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/composite_layers.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/convolution.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/gru.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/layers.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/lstm.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/parser.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/stats_layer.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/trivial_layers.py create mode 100755 evaluation/utility/asr/steps/libs/nnet3/xconfig/utils.py create mode 100755 evaluation/utility/asr/steps/lmrescore.sh create mode 100755 evaluation/utility/asr/steps/lmrescore_const_arpa.sh create mode 100755 evaluation/utility/asr/steps/lmrescore_const_arpa_undeterminized.sh create mode 100755 evaluation/utility/asr/steps/lmrescore_rnnlm_lat.sh create mode 100755 evaluation/utility/asr/steps/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/make_denlats_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/make_fbank.sh create mode 100755 evaluation/utility/asr/steps/make_fbank_pitch.sh create mode 100755 evaluation/utility/asr/steps/make_index.sh create mode 100755 evaluation/utility/asr/steps/make_mfcc.sh create mode 100755 
evaluation/utility/asr/steps/make_mfcc_pitch.sh create mode 100755 evaluation/utility/asr/steps/make_mfcc_pitch_online.sh create mode 100755 evaluation/utility/asr/steps/make_phone_graph.sh create mode 100755 evaluation/utility/asr/steps/make_plp.sh create mode 100755 evaluation/utility/asr/steps/make_plp_pitch.sh create mode 100755 evaluation/utility/asr/steps/nnet/align.sh create mode 100755 evaluation/utility/asr/steps/nnet/decode.sh create mode 100755 evaluation/utility/asr/steps/nnet/ivector/extract_ivectors.sh create mode 100755 evaluation/utility/asr/steps/nnet/ivector/train_diag_ubm.sh create mode 100755 evaluation/utility/asr/steps/nnet/ivector/train_ivector_extractor.sh create mode 100755 evaluation/utility/asr/steps/nnet/make_bn_feats.sh create mode 100755 evaluation/utility/asr/steps/nnet/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/nnet/make_fmllr_feats.sh create mode 100755 evaluation/utility/asr/steps/nnet/make_fmmi_feats.sh create mode 100755 evaluation/utility/asr/steps/nnet/make_priors.sh create mode 100755 evaluation/utility/asr/steps/nnet/pretrain_dbn.sh create mode 100755 evaluation/utility/asr/steps/nnet/train.sh create mode 100755 evaluation/utility/asr/steps/nnet/train_mmi.sh create mode 100755 evaluation/utility/asr/steps/nnet/train_mpe.sh create mode 100755 evaluation/utility/asr/steps/nnet/train_scheduler.sh create mode 100644 evaluation/utility/asr/steps/nnet2/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet2/adjust_priors.sh create mode 100755 evaluation/utility/asr/steps/nnet2/align.sh create mode 100755 evaluation/utility/asr/steps/nnet2/check_ivectors_compatible.sh create mode 100755 evaluation/utility/asr/steps/nnet2/convert_lda_to_raw.sh create mode 100755 evaluation/utility/asr/steps/nnet2/convert_nnet1_to_nnet2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/create_appended_model.sh create mode 100755 evaluation/utility/asr/steps/nnet2/decode.sh create mode 100755 evaluation/utility/asr/steps/nnet2/dump_bottleneck_features.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_egs2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_egs_discriminative2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_ivector_id.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_lda.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_lda_block.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_num_frames.sh create mode 100755 evaluation/utility/asr/steps/nnet2/get_perturbed_feats.sh create mode 100755 evaluation/utility/asr/steps/nnet2/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/nnet2/make_multisplice_configs.py create mode 100755 evaluation/utility/asr/steps/nnet2/relabel_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet2/relabel_egs2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/remove_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet2/retrain_fast.sh create mode 100755 evaluation/utility/asr/steps/nnet2/retrain_simple2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/retrain_tanh.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_block.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_convnet_accel2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_discriminative.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_discriminative2.sh create mode 100755 
evaluation/utility/asr/steps/nnet2/train_discriminative_multilang2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_more.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_more2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_multilang2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_multisplice_accel2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_multisplice_ensemble.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_accel2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_bottleneck_fast.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_ensemble.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_fast.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_multisplice.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_multisplice2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_simple.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_pnorm_simple2.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_tanh.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_tanh_bottleneck.sh create mode 100755 evaluation/utility/asr/steps/nnet2/train_tanh_fast.sh create mode 100755 evaluation/utility/asr/steps/nnet2/update_nnet.sh create mode 100644 evaluation/utility/asr/steps/nnet3/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/adjust_priors.sh create mode 100755 evaluation/utility/asr/steps/nnet3/align.sh create mode 100755 evaluation/utility/asr/steps/nnet3/align_lats.sh create mode 100644 evaluation/utility/asr/steps/nnet3/chain/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/build_tree.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/build_tree_multiple_sources.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/README.txt create mode 100644 evaluation/utility/asr/steps/nnet3/chain/e2e/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/compute_biphone_stats.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/get_egs_e2e.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/prepare_e2e.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/text_to_phones.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/e2e/train_e2e.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo.pl create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo2.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo3.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo4.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo5.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/gen_topo_orig.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/get_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/get_model_context.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/get_phone_post.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/make_weighted_den_fst.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/multilingual/combine_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain/train.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain/train_tdnn.sh 
create mode 100644 evaluation/utility/asr/steps/nnet3/chain2/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/combine_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/compute_preconditioning_matrix.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/get_raw_egs.sh create mode 100644 evaluation/utility/asr/steps/nnet3/chain2/internal/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/internal/get_best_model.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/internal/get_train_schedule.py create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/process_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/randomize_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/train.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/validate_processed_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/validate_randomized_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/chain2/validate_raw_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/components.py create mode 100755 evaluation/utility/asr/steps/nnet3/compute_output.sh create mode 100755 evaluation/utility/asr/steps/nnet3/convert_nnet2_to_nnet3.py create mode 100755 evaluation/utility/asr/steps/nnet3/decode.sh create mode 100755 evaluation/utility/asr/steps/nnet3/decode_grammar.sh create mode 100755 evaluation/utility/asr/steps/nnet3/decode_lookahead.sh create mode 100755 evaluation/utility/asr/steps/nnet3/decode_looped.sh create mode 100755 evaluation/utility/asr/steps/nnet3/decode_score_fusion.sh create mode 100755 evaluation/utility/asr/steps/nnet3/decode_semisup.sh create mode 100644 evaluation/utility/asr/steps/nnet3/dot/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/dot/descriptor_parser.py create mode 100755 evaluation/utility/asr/steps/nnet3/dot/nnet3_to_dot.py create mode 100755 evaluation/utility/asr/steps/nnet3/get_degs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/get_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/get_egs_discriminative.sh create mode 100755 evaluation/utility/asr/steps/nnet3/get_egs_targets.sh create mode 100755 evaluation/utility/asr/steps/nnet3/get_saturation.pl create mode 100755 evaluation/utility/asr/steps/nnet3/get_successful_models.py create mode 100644 evaluation/utility/asr/steps/nnet3/lstm/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/lstm/make_configs.py create mode 100755 evaluation/utility/asr/steps/nnet3/lstm/train.sh create mode 100755 evaluation/utility/asr/steps/nnet3/make_bottleneck_features.sh create mode 100755 evaluation/utility/asr/steps/nnet3/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/nnet3/make_tdnn_configs.py create mode 100644 evaluation/utility/asr/steps/nnet3/multilingual/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/multilingual/allocate_multilingual_examples.py create mode 100755 evaluation/utility/asr/steps/nnet3/multilingual/combine_egs.sh create mode 100755 evaluation/utility/asr/steps/nnet3/nnet3_to_dot.sh create mode 100755 evaluation/utility/asr/steps/nnet3/remove_egs.sh create mode 100644 evaluation/utility/asr/steps/nnet3/report/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/report/convert_model.py create mode 100755 evaluation/utility/asr/steps/nnet3/report/generate_plots.py create mode 100755 evaluation/utility/asr/steps/nnet3/report/summarize_compute_debug_timing.py create mode 100644 
evaluation/utility/asr/steps/nnet3/tdnn/__init__.py create mode 100755 evaluation/utility/asr/steps/nnet3/tdnn/make_configs.py create mode 100755 evaluation/utility/asr/steps/nnet3/tdnn/train.sh create mode 100755 evaluation/utility/asr/steps/nnet3/tdnn/train_raw_nnet.sh create mode 100755 evaluation/utility/asr/steps/nnet3/train_discriminative.sh create mode 100755 evaluation/utility/asr/steps/nnet3/train_dnn.py create mode 100755 evaluation/utility/asr/steps/nnet3/train_raw_dnn.py create mode 100755 evaluation/utility/asr/steps/nnet3/train_raw_rnn.py create mode 100755 evaluation/utility/asr/steps/nnet3/train_rnn.py create mode 100755 evaluation/utility/asr/steps/nnet3/train_tdnn.sh create mode 100755 evaluation/utility/asr/steps/nnet3/xconfig_to_config.py create mode 100755 evaluation/utility/asr/steps/nnet3/xconfig_to_configs.py create mode 100755 evaluation/utility/asr/steps/online/decode.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/align.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/copy_data_dir.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/copy_ivector_dir.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/decode.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/dump_nnet_activations.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/extract_ivectors.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/extract_ivectors_online.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/get_egs.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/get_egs2.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/get_egs_discriminative2.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/get_pca_transform.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/prepare_online_decoding.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/prepare_online_decoding_retrain.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/prepare_online_decoding_transfer.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/train_diag_ubm.sh create mode 100755 evaluation/utility/asr/steps/online/nnet2/train_ivector_extractor.sh create mode 100755 evaluation/utility/asr/steps/online/nnet3/decode.sh create mode 100755 evaluation/utility/asr/steps/online/nnet3/decode_wake_word.sh create mode 100755 evaluation/utility/asr/steps/online/nnet3/prepare_online_decoding.sh create mode 100755 evaluation/utility/asr/steps/online/prepare_online_decoding.sh create mode 100755 evaluation/utility/asr/steps/oracle_wer.sh create mode 100644 evaluation/utility/asr/steps/overlap/__init__.py create mode 100755 evaluation/utility/asr/steps/overlap/detect_overlaps_pyannote.py create mode 100755 evaluation/utility/asr/steps/overlap/detect_overlaps_pyannote.sh create mode 100755 evaluation/utility/asr/steps/overlap/get_overlap_segments.py create mode 100755 evaluation/utility/asr/steps/overlap/get_overlap_targets.py create mode 100755 evaluation/utility/asr/steps/overlap/output_to_rttm.py create mode 100755 evaluation/utility/asr/steps/overlap/post_process_output.sh create mode 100755 evaluation/utility/asr/steps/overlap/prepare_overlap_graph.py create mode 100755 evaluation/utility/asr/steps/paste_feats.sh create mode 100644 evaluation/utility/asr/steps/pytorchnn/__init__.py create mode 100755 evaluation/utility/asr/steps/pytorchnn/check_py.py create mode 100755 
evaluation/utility/asr/steps/pytorchnn/compute_sentence_scores.py create mode 100755 evaluation/utility/asr/steps/pytorchnn/data.py create mode 100755 evaluation/utility/asr/steps/pytorchnn/estimate_arc_nnlm_scores.py create mode 100755 evaluation/utility/asr/steps/pytorchnn/lmrescore_lattice_pytorchnn.sh create mode 100755 evaluation/utility/asr/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh create mode 100755 evaluation/utility/asr/steps/pytorchnn/model.py create mode 100755 evaluation/utility/asr/steps/pytorchnn/train.py create mode 100755 evaluation/utility/asr/steps/resegment_data.sh create mode 100755 evaluation/utility/asr/steps/resegment_text.sh create mode 100755 evaluation/utility/asr/steps/rnnlmrescore.sh create mode 100755 evaluation/utility/asr/steps/score_kaldi.sh create mode 100755 evaluation/utility/asr/steps/score_kaldi_compare.sh create mode 100755 evaluation/utility/asr/steps/scoring/score_kaldi_cer.sh create mode 100755 evaluation/utility/asr/steps/scoring/score_kaldi_compare.sh create mode 100755 evaluation/utility/asr/steps/scoring/score_kaldi_wer.sh create mode 100755 evaluation/utility/asr/steps/search_index.sh create mode 100644 evaluation/utility/asr/steps/segmentation/__init__.py create mode 100755 evaluation/utility/asr/steps/segmentation/ali_to_targets.sh create mode 100755 evaluation/utility/asr/steps/segmentation/combine_targets_dirs.sh create mode 100755 evaluation/utility/asr/steps/segmentation/convert_targets_dir_to_whole_recording.sh create mode 100755 evaluation/utility/asr/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py create mode 100755 evaluation/utility/asr/steps/segmentation/copy_targets_dir.sh create mode 100755 evaluation/utility/asr/steps/segmentation/decode_sad.sh create mode 100755 evaluation/utility/asr/steps/segmentation/detect_speech_activity.sh create mode 100755 evaluation/utility/asr/steps/segmentation/evaluate_segmentation.pl create mode 100755 evaluation/utility/asr/steps/segmentation/get_targets_for_out_of_segments.sh create mode 100644 evaluation/utility/asr/steps/segmentation/internal/__init__.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/arc_info_to_targets.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/find_oov_phone.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/get_default_targets_for_out_of_segments.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/get_transform_probs_mat.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/merge_segment_targets_to_recording.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/merge_targets.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/prepare_sad_graph.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/resample_targets.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/sad_to_segments.py create mode 100755 evaluation/utility/asr/steps/segmentation/internal/verify_phones_list.py create mode 100755 evaluation/utility/asr/steps/segmentation/lats_to_targets.sh create mode 100755 evaluation/utility/asr/steps/segmentation/merge_targets_dirs.sh create mode 100755 evaluation/utility/asr/steps/segmentation/post_process_sad_to_segments.sh create mode 100755 evaluation/utility/asr/steps/segmentation/prepare_targets_gmm.sh create mode 100755 evaluation/utility/asr/steps/segmentation/resample_targets_dir.sh create mode 100755 
evaluation/utility/asr/steps/segmentation/validate_targets_dir.sh create mode 100755 evaluation/utility/asr/steps/select_feats.sh create mode 100755 evaluation/utility/asr/steps/shift_feats.sh create mode 100755 evaluation/utility/asr/steps/subset_ali_dir.sh create mode 100755 evaluation/utility/asr/steps/tandem/align_fmllr.sh create mode 100755 evaluation/utility/asr/steps/tandem/align_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/align_si.sh create mode 100755 evaluation/utility/asr/steps/tandem/decode.sh create mode 100755 evaluation/utility/asr/steps/tandem/decode_fmllr.sh create mode 100755 evaluation/utility/asr/steps/tandem/decode_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/decode_si.sh create mode 100755 evaluation/utility/asr/steps/tandem/make_denlats.sh create mode 100755 evaluation/utility/asr/steps/tandem/make_denlats_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/mk_aslf_lda_mllt.sh create mode 100755 evaluation/utility/asr/steps/tandem/mk_aslf_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_deltas.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_lda_mllt.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_mllt.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_mmi.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_mmi_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_mono.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_sat.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/tandem/train_ubm.sh create mode 100644 evaluation/utility/asr/steps/tfrnnlm/__init__.py create mode 100755 evaluation/utility/asr/steps/tfrnnlm/check_py.py create mode 100755 evaluation/utility/asr/steps/tfrnnlm/check_tensorflow_installed.sh create mode 100755 evaluation/utility/asr/steps/tfrnnlm/lmrescore_rnnlm_lat.sh create mode 100755 evaluation/utility/asr/steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh create mode 100755 evaluation/utility/asr/steps/tfrnnlm/lstm.py create mode 100755 evaluation/utility/asr/steps/tfrnnlm/lstm_fast.py create mode 100755 evaluation/utility/asr/steps/tfrnnlm/reader.py create mode 100755 evaluation/utility/asr/steps/tfrnnlm/vanilla_rnnlm.py create mode 100755 evaluation/utility/asr/steps/train_deltas.sh create mode 100755 evaluation/utility/asr/steps/train_diag_ubm.sh create mode 100755 evaluation/utility/asr/steps/train_lda_mllt.sh create mode 100755 evaluation/utility/asr/steps/train_lvtln.sh create mode 100755 evaluation/utility/asr/steps/train_map.sh create mode 100755 evaluation/utility/asr/steps/train_mmi.sh create mode 100755 evaluation/utility/asr/steps/train_mmi_fmmi.sh create mode 100755 evaluation/utility/asr/steps/train_mmi_fmmi_indirect.sh create mode 100755 evaluation/utility/asr/steps/train_mmi_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/train_mono.sh create mode 100755 evaluation/utility/asr/steps/train_mpe.sh create mode 100755 evaluation/utility/asr/steps/train_nnet.sh create mode 100755 evaluation/utility/asr/steps/train_quick.sh create mode 100755 evaluation/utility/asr/steps/train_raw_sat.sh create mode 100755 evaluation/utility/asr/steps/train_sat.sh create mode 100755 evaluation/utility/asr/steps/train_sat_basis.sh create mode 100755 evaluation/utility/asr/steps/train_segmenter.sh create mode 100755 evaluation/utility/asr/steps/train_sgmm2.sh create mode 100755 evaluation/utility/asr/steps/train_sgmm2_group.sh 
create mode 100755 evaluation/utility/asr/steps/train_smbr.sh create mode 100755 evaluation/utility/asr/steps/train_ubm.sh create mode 100755 evaluation/utility/asr/steps/word_align_lattices.sh create mode 100644 evaluation/utility/asr/utils/__init__.py create mode 100755 evaluation/utility/asr/utils/add_disambig.pl create mode 100755 evaluation/utility/asr/utils/add_lex_disambig.pl create mode 100755 evaluation/utility/asr/utils/analyze_segments.pl create mode 100755 evaluation/utility/asr/utils/apply_map.pl create mode 100755 evaluation/utility/asr/utils/best_wer.sh create mode 100755 evaluation/utility/asr/utils/build_const_arpa_lm.sh create mode 100755 evaluation/utility/asr/utils/build_kenlm_model_from_arpa.sh create mode 100755 evaluation/utility/asr/utils/combine_data.sh create mode 100755 evaluation/utility/asr/utils/convert_ctm.pl create mode 100755 evaluation/utility/asr/utils/convert_slf.pl create mode 100755 evaluation/utility/asr/utils/convert_slf_parallel.sh create mode 100755 evaluation/utility/asr/utils/copy_data_dir.sh create mode 100755 evaluation/utility/asr/utils/create_data_link.pl create mode 100755 evaluation/utility/asr/utils/create_split_dir.pl create mode 100644 evaluation/utility/asr/utils/ctm/__init__.py create mode 100755 evaluation/utility/asr/utils/ctm/convert_ctm.pl create mode 100755 evaluation/utility/asr/utils/ctm/fix_ctm.sh create mode 100755 evaluation/utility/asr/utils/ctm/resolve_ctm_overlaps.py create mode 100644 evaluation/utility/asr/utils/data/__init__.py create mode 100755 evaluation/utility/asr/utils/data/combine_data.sh create mode 100755 evaluation/utility/asr/utils/data/combine_short_segments.sh create mode 100755 evaluation/utility/asr/utils/data/convert_data_dir_to_whole.sh create mode 100755 evaluation/utility/asr/utils/data/copy_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/extend_segment_times.py create mode 100755 evaluation/utility/asr/utils/data/extract_wav_segments_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/fix_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/fix_subsegment_feats.pl create mode 100755 evaluation/utility/asr/utils/data/get_allowed_durations.py create mode 100755 evaluation/utility/asr/utils/data/get_frame_shift.sh create mode 100755 evaluation/utility/asr/utils/data/get_num_frames.sh create mode 100755 evaluation/utility/asr/utils/data/get_reco2dur.sh create mode 100755 evaluation/utility/asr/utils/data/get_reco2utt_for_data.sh create mode 100755 evaluation/utility/asr/utils/data/get_segments_for_data.sh create mode 100755 evaluation/utility/asr/utils/data/get_uniform_subsegments.py create mode 100755 evaluation/utility/asr/utils/data/get_utt2dur.sh create mode 100755 evaluation/utility/asr/utils/data/get_utt2num_frames.sh create mode 100644 evaluation/utility/asr/utils/data/internal/__init__.py create mode 100755 evaluation/utility/asr/utils/data/internal/choose_utts_to_combine.py create mode 100755 evaluation/utility/asr/utils/data/internal/combine_segments_to_recording.py create mode 100755 evaluation/utility/asr/utils/data/internal/modify_speaker_info.py create mode 100755 evaluation/utility/asr/utils/data/internal/perturb_volume.py create mode 100755 evaluation/utility/asr/utils/data/limit_feature_dim.sh create mode 100755 evaluation/utility/asr/utils/data/modify_speaker_info.sh create mode 100755 evaluation/utility/asr/utils/data/modify_speaker_info_to_recording.sh create mode 100755 evaluation/utility/asr/utils/data/normalize_data_range.pl create mode 
100755 evaluation/utility/asr/utils/data/perturb_data_dir_speed.sh create mode 100755 evaluation/utility/asr/utils/data/perturb_data_dir_speed_3way.sh create mode 100755 evaluation/utility/asr/utils/data/perturb_data_dir_volume.sh create mode 100755 evaluation/utility/asr/utils/data/perturb_speed_to_allowed_lengths.py create mode 100755 evaluation/utility/asr/utils/data/remove_dup_utts.sh create mode 100755 evaluation/utility/asr/utils/data/resample_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/shift_and_combine_feats.sh create mode 100755 evaluation/utility/asr/utils/data/shift_feats.sh create mode 100755 evaluation/utility/asr/utils/data/split_data.sh create mode 100755 evaluation/utility/asr/utils/data/subsegment_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/subset_data_dir.sh create mode 100755 evaluation/utility/asr/utils/data/validate_data_dir.sh create mode 100755 evaluation/utility/asr/utils/dict_dir_add_pronprobs.sh create mode 100755 evaluation/utility/asr/utils/eps2disambig.pl create mode 100755 evaluation/utility/asr/utils/filt.py create mode 100755 evaluation/utility/asr/utils/filter_scp.pl create mode 100755 evaluation/utility/asr/utils/filter_scps.pl create mode 100755 evaluation/utility/asr/utils/find_arpa_oovs.pl create mode 100755 evaluation/utility/asr/utils/fix_ctm.sh create mode 100755 evaluation/utility/asr/utils/fix_data_dir.sh create mode 100755 evaluation/utility/asr/utils/format_lm.sh create mode 100755 evaluation/utility/asr/utils/format_lm_sri.sh create mode 100755 evaluation/utility/asr/utils/gen_topo.pl create mode 100755 evaluation/utility/asr/utils/generate_selected_speakers.py create mode 100755 evaluation/utility/asr/utils/int2sym.pl create mode 100755 evaluation/utility/asr/utils/kwslist_post_process.pl create mode 100644 evaluation/utility/asr/utils/lang/__init__.py create mode 100755 evaluation/utility/asr/utils/lang/add_lex_disambig.pl create mode 100755 evaluation/utility/asr/utils/lang/add_unigrams_arpa.pl create mode 100755 evaluation/utility/asr/utils/lang/adjust_unk_arpa.pl create mode 100755 evaluation/utility/asr/utils/lang/adjust_unk_graph.sh create mode 100644 evaluation/utility/asr/utils/lang/bpe/__init__.py create mode 100755 evaluation/utility/asr/utils/lang/bpe/add_final_optional_silence.sh create mode 100755 evaluation/utility/asr/utils/lang/bpe/apply_bpe.py create mode 100755 evaluation/utility/asr/utils/lang/bpe/bidi.py create mode 100755 evaluation/utility/asr/utils/lang/bpe/learn_bpe.py create mode 100755 evaluation/utility/asr/utils/lang/bpe/prepend_words.py create mode 100755 evaluation/utility/asr/utils/lang/bpe/reverse.py create mode 100755 evaluation/utility/asr/utils/lang/check_g_properties.pl create mode 100755 evaluation/utility/asr/utils/lang/check_phones_compatible.sh create mode 100755 evaluation/utility/asr/utils/lang/compute_sentence_probs_arpa.py create mode 100755 evaluation/utility/asr/utils/lang/extend_lang.sh create mode 100755 evaluation/utility/asr/utils/lang/get_word_position_phone_map.pl create mode 100644 evaluation/utility/asr/utils/lang/grammar/__init__.py create mode 100755 evaluation/utility/asr/utils/lang/grammar/augment_phones_txt.py create mode 100755 evaluation/utility/asr/utils/lang/grammar/augment_words_txt.py create mode 100644 evaluation/utility/asr/utils/lang/internal/__init__.py create mode 100755 evaluation/utility/asr/utils/lang/internal/apply_unk_lm.sh create mode 100755 evaluation/utility/asr/utils/lang/internal/arpa2fst_constrained.py create mode 100755 
evaluation/utility/asr/utils/lang/internal/modify_unk_pron.py create mode 100755 evaluation/utility/asr/utils/lang/limit_arpa_unk_history.py create mode 100755 evaluation/utility/asr/utils/lang/make_kn_lm.py create mode 100755 evaluation/utility/asr/utils/lang/make_lexicon_fst.py create mode 100755 evaluation/utility/asr/utils/lang/make_lexicon_fst_silprob.py create mode 100755 evaluation/utility/asr/utils/lang/make_phone_bigram_lang.sh create mode 100755 evaluation/utility/asr/utils/lang/make_phone_lm.py create mode 100755 evaluation/utility/asr/utils/lang/make_position_dependent_subword_lexicon.py create mode 100755 evaluation/utility/asr/utils/lang/make_subword_lexicon_fst.py create mode 100755 evaluation/utility/asr/utils/lang/make_unk_lm.sh create mode 100755 evaluation/utility/asr/utils/lang/ngram_entropy_pruning.py create mode 100755 evaluation/utility/asr/utils/lang/prepare_lang.sh create mode 100755 evaluation/utility/asr/utils/lang/validate_disambig_sym_file.pl create mode 100755 evaluation/utility/asr/utils/lang/validate_lang.pl create mode 100755 evaluation/utility/asr/utils/ln.pl create mode 100755 evaluation/utility/asr/utils/make_absolute.sh create mode 100755 evaluation/utility/asr/utils/make_lexicon_fst.pl create mode 100755 evaluation/utility/asr/utils/make_lexicon_fst_silprob.pl create mode 100755 evaluation/utility/asr/utils/make_unigram_grammar.pl create mode 100755 evaluation/utility/asr/utils/map_arpa_lm.pl create mode 100755 evaluation/utility/asr/utils/mkgraph.sh create mode 100755 evaluation/utility/asr/utils/mkgraph_lookahead.sh create mode 100755 evaluation/utility/asr/utils/nnet-cpu/make_nnet_config.pl create mode 100755 evaluation/utility/asr/utils/nnet-cpu/make_nnet_config_block.pl create mode 100755 evaluation/utility/asr/utils/nnet-cpu/make_nnet_config_preconditioned.pl create mode 100755 evaluation/utility/asr/utils/nnet-cpu/update_learning_rates.pl create mode 100644 evaluation/utility/asr/utils/nnet/__init__.py create mode 100755 evaluation/utility/asr/utils/nnet/gen_dct_mat.py create mode 100755 evaluation/utility/asr/utils/nnet/gen_hamm_mat.py create mode 100755 evaluation/utility/asr/utils/nnet/gen_splice.py create mode 100755 evaluation/utility/asr/utils/nnet/make_blstm_proto.py create mode 100755 evaluation/utility/asr/utils/nnet/make_cnn_proto.py create mode 100755 evaluation/utility/asr/utils/nnet/make_lstm_proto.py create mode 100755 evaluation/utility/asr/utils/nnet/make_nnet_proto.py create mode 100755 evaluation/utility/asr/utils/nnet/subset_data_tr_cv.sh create mode 100644 evaluation/utility/asr/utils/nnet3/__init__.py create mode 100755 evaluation/utility/asr/utils/nnet3/convert_config_tdnn_to_affine.py create mode 100755 evaluation/utility/asr/utils/parallel/limit_num_gpus.sh create mode 100755 evaluation/utility/asr/utils/parallel/pbs.pl create mode 100755 evaluation/utility/asr/utils/parallel/queue.pl create mode 100755 evaluation/utility/asr/utils/parallel/retry.pl create mode 100755 evaluation/utility/asr/utils/parallel/run.pl create mode 100755 evaluation/utility/asr/utils/parallel/slurm.pl create mode 100755 evaluation/utility/asr/utils/parse_options.sh create mode 100755 evaluation/utility/asr/utils/pbs.pl create mode 100755 evaluation/utility/asr/utils/perturb_data_dir_speed.sh create mode 100755 evaluation/utility/asr/utils/pinyin_map.pl create mode 100755 evaluation/utility/asr/utils/prepare_extended_lang.sh create mode 100755 evaluation/utility/asr/utils/prepare_lang.sh create mode 100755 
evaluation/utility/asr/utils/prepare_online_nnet_dist_build.sh create mode 100755 evaluation/utility/asr/utils/queue.pl create mode 100755 evaluation/utility/asr/utils/remove_data_links.sh create mode 100755 evaluation/utility/asr/utils/remove_oovs.pl create mode 100755 evaluation/utility/asr/utils/require_argument.sh create mode 100755 evaluation/utility/asr/utils/require_argument_all.sh create mode 100755 evaluation/utility/asr/utils/retry.pl create mode 100755 evaluation/utility/asr/utils/reverse_arpa.py create mode 100755 evaluation/utility/asr/utils/rnnlm_compute_scores.sh create mode 100755 evaluation/utility/asr/utils/run.pl create mode 100755 evaluation/utility/asr/utils/s2eps.pl create mode 100755 evaluation/utility/asr/utils/scoring/wer_ops_details.pl create mode 100755 evaluation/utility/asr/utils/scoring/wer_per_spk_details.pl create mode 100755 evaluation/utility/asr/utils/scoring/wer_per_utt_details.pl create mode 100755 evaluation/utility/asr/utils/scoring/wer_report.pl create mode 100755 evaluation/utility/asr/utils/segmentation.pl create mode 100755 evaluation/utility/asr/utils/show_lattice.sh create mode 100755 evaluation/utility/asr/utils/shuffle_list.pl create mode 100755 evaluation/utility/asr/utils/slurm.pl create mode 100755 evaluation/utility/asr/utils/spk2utt_to_utt2spk.pl create mode 100755 evaluation/utility/asr/utils/split_data.sh create mode 100755 evaluation/utility/asr/utils/split_scp.pl create mode 100755 evaluation/utility/asr/utils/ssh.pl create mode 100755 evaluation/utility/asr/utils/subset_data_dir.sh create mode 100755 evaluation/utility/asr/utils/subset_data_dir_tr_cv.sh create mode 100755 evaluation/utility/asr/utils/subset_scp.pl create mode 100755 evaluation/utility/asr/utils/subword/prepare_lang_subword.sh create mode 100755 evaluation/utility/asr/utils/subword/prepare_subword_text.sh create mode 100755 evaluation/utility/asr/utils/summarize_logs.pl create mode 100755 evaluation/utility/asr/utils/summarize_warnings.pl create mode 100755 evaluation/utility/asr/utils/sym2int.pl create mode 100755 evaluation/utility/asr/utils/train_arpa_with_kenlm.sh create mode 100755 evaluation/utility/asr/utils/utt2spk_to_spk2utt.pl create mode 100755 evaluation/utility/asr/utils/validate_data_dir.sh create mode 100755 evaluation/utility/asr/utils/validate_dict_dir.pl create mode 100755 evaluation/utility/asr/utils/validate_lang.pl create mode 100755 evaluation/utility/asr/utils/validate_text.pl create mode 100755 evaluation/utility/asr/utils/write_kwslist.pl create mode 100644 evaluation/utility/voice_distinctiveness/__init__.py create mode 100644 evaluation/utility/voice_distinctiveness/deid_gvd.py create mode 100644 figures/framework.png create mode 100644 requirements.txt create mode 100644 run_anonymization.py create mode 100644 run_evaluation.py create mode 100644 utils/__init__.py create mode 100644 utils/convert_results.py create mode 100644 utils/data_io.py create mode 100644 utils/path_management.py create mode 100644 utils/prepare_results_in_kaldi_format.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are 
written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/README.md b/README.md index 2dc532d..3874d05 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,86 @@ -# VoicePAT -VoicePAT is a modular and efficient toolkit for voice privacy research, with main focus on speaker anonymization. +# VoicePAT: Voice Privacy Anonymization Toolkit + +**Note: This repository and its documentation are still under construction but can already be used for both +anonymization and evaluation.** + +VoicePAT is a toolkit for voice privacy research, with a special focus on speaker anonymization. +It is based on the framework(s) by the [VoicePrivacy Challenges](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022) but contains the following improvements: + +* It consists of **two separate procedures for anonymization and evaluation**. This means that the generation of + anonymized speech is independent of the evaluation of anonymization systems. The two processes do not need to be + executed in the same run or with the same settings. Of course, you need to perform the anonymization of evaluation + data with one system before you can evaluate it, but this could have happened at an earlier time and with an + external codebase. +* Anonymization and evaluation procedures are **structured as pipelines** consisting of separate **modules**. Each + module may have a selection of different models or algorithms to fulfill its role. The settings for each procedure + / pipeline are defined exclusively in configuration files. See the *Usage* section below for more information. +* **Evaluation models** have been replaced with models based on [SpeechBrain](https://github.com/speechbrain/speechbrain/) and [ESPnet](https://github.com/espnet/espnet/) which are **more powerful** than the + previous Kaldi-based models. Furthermore, we added new techniques to make evaluation significantly **more + efficient**.
+* The framework is written in **Python**, making it easy to include and adapt other Python-based models, e.g., using + PyTorch. When using the framework, you do not need in-depth knowledge about anything outside the Python realm + (Disclaimer: While the framework is written in Python, the ASR evaluation currently relies on an ESPnet-based model + which in turn is based on Kaldi. However, you do not need to modify that part of the code to use or + change the ASR model, and ESPnet is currently working on a Kaldi-free version.) + + +## Installation +Simply clone the repository and install the dependencies in [requirements.txt](requirements.txt). If you want to use +the ESPnet-based ASR evaluation model, you additionally need to clone and install [ESPnet](https://github.com/espnet/espnet/) and set the path to +it in [evaluation/utility/asr/path.sh](evaluation/utility/asr/path.sh), e.g., ``MAIN_ROOT=~/espnet``. + +## Usage + +![](figures/framework.png) + +To use the toolkit with the existing methods, you can use the configuration files in [configs](configs). You can +also add more modules and models to the code and create your own config by using the existing ones as a template. + + +### Anonymization +The framework currently contains only one pipeline and config for anonymization, [anon_ims_sttts_pc.yaml](configs/anon_ims_sttts_pc.yaml). If you are using this config, you need to modify at least the following entries: +``` +data_dir: path to original data in Kaldi-format for anonymization +results_dir: path to location for all (intermediate) results of the anonymization +models_dir: path to models location +``` + +Running an anonymization pipeline is done like this: +``` +python run_anonymization.py --config anon_ims_sttts_pc.yaml --gpu_ids 0,1 --force_compute +``` +This will perform all computations that support parallel computing on the GPUs with IDs 0 and 1, and on GPU 0 +otherwise. If no gpu_ids are specified, it will run only on GPU 0 or CPU, depending on whether CUDA is available. +`--force_compute` causes all previous computations to be run again. In most cases, you can delete that flag from the +command to speed up the anonymization. + +Pretrained models for this anonymization can be found at +[https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0](https://github.com/DigitalPhonetics/speaker-anonymization/releases/tag/v2.0) and earlier releases. + +### Evaluation +All other config files in [configs](configs) can be used for evaluation with different settings. In these configs, +you need to adapt at least +``` +eval_data_dir: path to anonymized evaluation data in Kaldi-format +asr/libri_dir: path to original LibriSpeech dataset +``` + +Running an evaluation pipeline is done like this: +``` +python run_evaluation.py --config eval_pre_ecapa_cos.yaml --gpu_ids 1,2,3 +``` +making the GPUs with IDs 1, 2 and 3 available to the process. If no GPU IDs are specified, it will default to CUDA:0 or +use all available GPUs if +CUDA is available, or run on the CPU otherwise. + +Pretrained evaluation models can be found in release v1.
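Because the two stages are decoupled, a typical experiment first runs the anonymization and then evaluates the resulting data. The following is a minimal sketch, not part of this patch, of how one might chain the two entry points shown above from a single Python script via `subprocess`. It assumes the script is launched from the repository root, that the referenced configs have been adapted as described, and that the anonymized output ends up where `eval_data_dir` in the evaluation config points; adjust `--gpu_ids` to the available hardware.

```
import subprocess

# Hypothetical convenience wrapper around the two CLI entry points described above.
# Assumes: run from the repository root, configs already adapted, and the anonymized
# data written to the location that eval_data_dir in the evaluation config expects.
ANON_CONFIG = "anon_ims_sttts_pc.yaml"   # anonymization config in configs/
EVAL_CONFIG = "eval_pre_ecapa_cos.yaml"  # one of the evaluation configs in configs/
GPU_IDS = "0,1"                          # adjust to the available GPUs


def run_stage(cmd):
    # Print and run one stage, aborting the workflow if it fails.
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    # Stage 1: generate anonymized speech (add --force_compute to ignore cached results).
    run_stage(["python", "run_anonymization.py", "--config", ANON_CONFIG, "--gpu_ids", GPU_IDS])
    # Stage 2: evaluate privacy and utility of the anonymized data.
    run_stage(["python", "run_evaluation.py", "--config", EVAL_CONFIG, "--gpu_ids", GPU_IDS])
```

Keeping the two calls separate mirrors the design described above: the evaluation can equally be pointed at anonymized data produced earlier or by an external system by adapting only the evaluation config.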
+
+## Acknowledgements
+Several parts of this toolkit are based on or use code from external sources, i.e.,
+* [VoicePrivacy Challenge 2022](https://github.com/Voice-Privacy-Challenge/Voice-Privacy-Challenge-2022), [ESPnet](https://github.com/espnet/espnet/), [SpeechBrain](https://github.com/speechbrain/speechbrain/) for evaluation
+* the [GAN-based anonymization system by IMS (University of Stuttgart)](https://github.com/DigitalPhonetics/speaker-anonymization) for anonymization
+
+See the READMEs for [anonymization](anonymization/README.md) and [evaluation](evaluation/README.md) for more
+information.
\ No newline at end of file
diff --git a/anonymization/README.md b/anonymization/README.md
new file mode 100644
index 0000000..7006f3c
--- /dev/null
+++ b/anonymization/README.md
@@ -0,0 +1,7 @@
+# Anonymization
+
+The anonymization branch can contain multiple pipelines, modules and models. So far, the only pipeline added is the
+[Speech-to-Text-to-Speech (STTTS) pipeline](https://ieeexplore.ieee.org/document/10096607), based on this code:
+[https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization).
+
+*This documentation is still under construction and will be extended soon.*
\ No newline at end of file
diff --git a/anonymization/__init__.py b/anonymization/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/anonymization/modules/__init__.py b/anonymization/modules/__init__.py
new file mode 100644
index 0000000..740e328
--- /dev/null
+++ b/anonymization/modules/__init__.py
@@ -0,0 +1,4 @@
+from .tts import SpeechSynthesis
+from .text import SpeechRecognition
+from .prosody import ProsodyExtraction, ProsodyAnonymization
+from .speaker_embeddings import SpeakerExtraction, SpeakerAnonymization
diff --git a/anonymization/modules/prosody/__init__.py b/anonymization/modules/prosody/__init__.py
new file mode 100644
index 0000000..65d7b50
--- /dev/null
+++ b/anonymization/modules/prosody/__init__.py
@@ -0,0 +1,2 @@
+from .prosody_extraction import ProsodyExtraction
+from .prosody_anonymization import ProsodyAnonymization
\ No newline at end of file
diff --git a/anonymization/modules/prosody/anonymization/__init__.py b/anonymization/modules/prosody/anonymization/__init__.py
new file mode 100644
index 0000000..357c9ce
--- /dev/null
+++ b/anonymization/modules/prosody/anonymization/__init__.py
@@ -0,0 +1 @@
+from .ims_prosody_anonymization import ImsProsodyAnonymization
\ No newline at end of file
diff --git a/anonymization/modules/prosody/anonymization/ims_prosody_anonymization.py b/anonymization/modules/prosody/anonymization/ims_prosody_anonymization.py
new file mode 100644
index 0000000..c205857
--- /dev/null
+++ b/anonymization/modules/prosody/anonymization/ims_prosody_anonymization.py
@@ -0,0 +1,29 @@
+import torch
+
+
+class ImsProsodyAnonymization:
+
+    def __init__(self, random_offset_lower, random_offset_higher):
+        self.random_offset_lower = random_offset_lower
+        self.random_offset_higher = random_offset_higher
+
+    def anonymize_values(self, duration, energy, pitch, **kwargs):
+        # **kwargs collects additional prosodic values (e.g. start/end silences) that are passed through unchanged
+        if self.random_offset_lower is not None and self.random_offset_higher is not None:
+            scales = torch.randint(low=self.random_offset_lower, high=self.random_offset_higher,
+                                    size=energy.size()).float() / 100
+            energy = energy * scales
+
+        if self.random_offset_lower is not None and self.random_offset_higher is not None:
+            scales = torch.randint(low=self.random_offset_lower,
high=self.random_offset_higher, + size=pitch.size()).float() / 100 + pitch = pitch * scales + + return_dict = { + 'duration': duration, + 'energy': energy, + 'pitch': pitch + } + + return_dict.update(kwargs) + + return return_dict diff --git a/anonymization/modules/prosody/extraction/__init__.py b/anonymization/modules/prosody/extraction/__init__.py new file mode 100644 index 0000000..187a6a4 --- /dev/null +++ b/anonymization/modules/prosody/extraction/__init__.py @@ -0,0 +1 @@ +from .ims_prosody_extraction import ImsProsodyExtractor \ No newline at end of file diff --git a/anonymization/modules/prosody/extraction/ims_prosody_extraction.py b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py new file mode 100644 index 0000000..cc700af --- /dev/null +++ b/anonymization/modules/prosody/extraction/ims_prosody_extraction.py @@ -0,0 +1,153 @@ +import torch +torch.set_num_threads(1) + +from torch.optim import SGD +import soundfile as sf + +from anonymization.modules.tts.IMSToucan.Preprocessing.AudioPreprocessor import AudioPreprocessor +from anonymization.modules.tts.IMSToucan.Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend +from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner +from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator +from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator +from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth + + +class ImsProsodyExtractor: + + def __init__(self, aligner_path, device, on_line_fine_tune=True, random_offset_lower=None, + random_offset_higher=None): + self.on_line_fine_tune = on_line_fine_tune + self.random_offset_lower = random_offset_lower + self.random_offset_higher = random_offset_higher + + self.ap = AudioPreprocessor(input_sr=16000, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False) + self.tf = ArticulatoryCombinedTextFrontend(language="en") + self.device = device + self.aligner_weights = torch.load(aligner_path, map_location='cpu')["asr_model"] + torch.hub._validate_not_a_forked_repo = lambda a, b, c: True # torch 1.9 has a bug in the hub loading, this is a workaround + # careful: assumes 16kHz or 8kHz audio + self.silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=False, + onnx=False, + verbose=False) + (self.get_speech_timestamps, _, _, _, _) = utils + torch.set_grad_enabled(True) # finding this issue was very infuriating: silero sets + # this to false globally during model loading rather than using inference mode or no_grad + + def extract_prosody(self, + transcript, + ref_audio_path, + lang="en", + input_is_phones=False): + acoustic_model = Aligner() + acoustic_model.load_state_dict(self.aligner_weights) + acoustic_model = acoustic_model.to(self.device) + parsel = Parselmouth(reduction_factor=1, fs=16000) + energy_calc = EnergyCalculator(reduction_factor=1, fs=16000) + dc = DurationCalculator(reduction_factor=1) + wave, sr = sf.read(ref_audio_path) + if self.tf.language != lang: + self.tf = ArticulatoryCombinedTextFrontend(language=lang) + if self.ap.sr != sr: + self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False) + try: + norm_wave = 
self.ap.audio_to_wave_tensor(normalize=True, audio=wave) + except ValueError: + print('Something went wrong, the reference wave might be too short.') + raise RuntimeError + + with torch.inference_mode(): + speech_timestamps = self.get_speech_timestamps(norm_wave, self.silero_model, sampling_rate=16000) + start_silence = speech_timestamps[0]['start'] + end_silence = len(norm_wave) - speech_timestamps[-1]['end'] + norm_wave = norm_wave[speech_timestamps[0]['start']:speech_timestamps[-1]['end']] + + norm_wave_length = torch.LongTensor([len(norm_wave)]) + text = self.tf.string_to_tensor(transcript, handle_missing=True, input_phonemes=input_is_phones).squeeze( + 0) + melspec = self.ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1) + melspec_length = torch.LongTensor([len(melspec)]).numpy() + + if self.on_line_fine_tune: + # we fine-tune the aligner for a couple steps using SGD. This makes cloning pretty slow, but the results are greatly improved. + steps = 3 + tokens = list() # we need an ID sequence for training rather than a sequence of phonological features + for vector in text: + if vector[21] == 0: # we don't include word boundaries when performing alignment, since they are not always present in audio. + for phone in self.tf.phone_to_vector: + if vector.numpy().tolist()[13:] == self.tf.phone_to_vector[phone][13:]: + # the first 12 dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup + tokens.append(self.tf.phone_to_id[phone]) + # this is terribly inefficient, but it's fine + break + tokens = torch.LongTensor(tokens).squeeze().to(self.device) + tokens_len = torch.LongTensor([len(tokens)]).to(self.device) + mel = melspec.unsqueeze(0).to(self.device) + mel.requires_grad = True + mel_len = torch.LongTensor([len(mel[0])]).to(self.device) + # actual fine-tuning starts here + optim_asr = SGD(acoustic_model.parameters(), lr=0.1) + acoustic_model.train() + for _ in list(range(steps)): + pred = acoustic_model(mel) + loss = acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len) + optim_asr.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(acoustic_model.parameters(), 1.0) + optim_asr.step() + acoustic_model.eval() + + # We deal with the word boundaries by having 2 versions of text: with and without word boundaries. + # We note the index of word boundaries and insert durations of 0 afterwards + text_without_word_boundaries = list() + indexes_of_word_boundaries = list() + for phoneme_index, vector in enumerate(text): + if vector[21] == 0: + text_without_word_boundaries.append(vector.numpy().tolist()) + else: + indexes_of_word_boundaries.append(phoneme_index) + matrix_without_word_boundaries = torch.Tensor(text_without_word_boundaries) + + alignment_path = acoustic_model.inference(mel=melspec.to(self.device), + tokens=matrix_without_word_boundaries.to(self.device), + return_ctc=False) + + duration = dc(torch.LongTensor(alignment_path), vis=None).cpu() + + for index_of_word_boundary in indexes_of_word_boundaries: + duration = torch.cat([duration[:index_of_word_boundary], + torch.LongTensor([0]), # insert a 0 duration wherever there is a word boundary + duration[index_of_word_boundary:]]) + + last_vec = None + for phoneme_index, vec in enumerate(text): + if last_vec is not None: + if last_vec.numpy().tolist() == vec.numpy().tolist(): + # we found a case of repeating phonemes! 
+ # now we must repair their durations by giving the first one 3/5 of their sum and the second one 2/5 (i.e. the rest) + dur_1 = duration[phoneme_index - 1] + dur_2 = duration[phoneme_index] + total_dur = dur_1 + dur_2 + new_dur_1 = int((total_dur / 5) * 3) + new_dur_2 = total_dur - new_dur_1 + duration[phoneme_index - 1] = new_dur_1 + duration[phoneme_index] = new_dur_2 + last_vec = vec + + with torch.inference_mode(): + energy = energy_calc(input_waves=norm_wave.unsqueeze(0), + input_waves_lengths=norm_wave_length, + feats_lengths=melspec_length, + text=text, + durations=duration.unsqueeze(0), + durations_lengths=torch.LongTensor([len(duration)]))[0].squeeze(0).cpu() + + pitch = parsel(input_waves=norm_wave.unsqueeze(0), + input_waves_lengths=norm_wave_length, + feats_lengths=melspec_length, + text=text, + durations=duration.unsqueeze(0), + durations_lengths=torch.LongTensor([len(duration)]))[0].squeeze(0).cpu() + + return duration, pitch, energy, start_silence, end_silence diff --git a/anonymization/modules/prosody/prosody.py b/anonymization/modules/prosody/prosody.py new file mode 100644 index 0000000..7937d1c --- /dev/null +++ b/anonymization/modules/prosody/prosody.py @@ -0,0 +1,117 @@ +import numpy as np +import torch + + +class Prosody: + + def __init__(self): + self.utterances = {} + self.idx2utt = {} + self.durations = [] + self.pitches = [] + self.energies = [] + + self.start_silences = [] + self.end_silences = [] + + self.new = True + + def __len__(self): + return len(self.utterances) + + def __iter__(self): + if set(self.start_silences) != {None}: + for i in range(len(self)): + yield self.idx2utt[i], self.durations[i], self.pitches[i], self.energies[i], self.start_silences[i], \ + self.end_silences[i] + else: + for i in range(len(self)): + yield self.idx2utt[i], self.durations[i], self.pitches[i], self.energies[i] + + def add_instance(self, utterance, duration, pitch, energy, start_silence=None, end_silence=None): + idx = len(self) + self.utterances[utterance] = idx + self.idx2utt[idx] = utterance + self.durations.append(duration) + self.pitches.append(pitch) + self.energies.append(energy) + self.start_silences.append(start_silence) + self.end_silences.append(end_silence) + + def get_instance(self, utterance): + idx = self.utterances[utterance] + return_dict = { + 'duration': self.durations[idx], + 'pitch': self.pitches[idx], + 'energy': self.energies[idx] + } + + if len(self.start_silences) > 0: + return_dict['start_silence'] = self.start_silences[idx] + return_dict['end_silence'] = self.end_silences[idx] + + return return_dict + + def update_instance(self, utterance, duration, pitch, energy, start_silence=None, end_silence=None): + idx = self.utterances[utterance] + self.durations[idx] = duration + self.pitches[idx] = pitch + self.energies[idx] = energy + self.start_silences[idx] = start_silence + self.end_silences[idx] = end_silence + + def shuffle(self): + shuffled_utterances = {} + shuffled_durations = [] + shuffled_pitches = [] + shuffled_energies = [] + shuffled_start_silences = [] + shuffled_end_silences = [] + + i = 0 + for idx in np.random.permutation(len(self)): + shuffled_utterances[self.idx2utt[idx]] = i + shuffled_durations.append(self.durations[idx]) + shuffled_pitches.append(self.pitches[idx]) + shuffled_energies.append(self.energies[idx]) + shuffled_start_silences.append(self.start_silences[idx]) + shuffled_end_silences.append(self.end_silences[idx]) + i += 1 + + self.utterances = shuffled_utterances + self.durations = shuffled_durations + self.pitches = 
shuffled_pitches + self.energies = shuffled_energies + self.start_silences = shuffled_start_silences + self.end_silences = shuffled_end_silences + self.idx2utt = {idx: utt for utt, idx in self.utterances.items()} + + def save_prosody(self, out_dir): + out_dir.mkdir(exist_ok=True, parents=True) + + torch.save(self.durations, out_dir / 'duration.pt') + torch.save(self.pitches, out_dir / 'pitch.pt') + torch.save(self.energies, out_dir / 'energy.pt') + torch.save(self.start_silences, out_dir / 'start_silence.pt') + torch.save(self.end_silences, out_dir / 'end_silence.pt') + + with open(out_dir / 'utterances', 'w') as f: + for utt, _ in sorted(self.utterances.items(), key=lambda x: x[1]): + f.write(f'{utt}\n') + + def load_prosody(self, in_dir): + self.new = False + + self.durations = torch.load(in_dir / 'duration.pt', map_location='cpu') + self.pitches = torch.load(in_dir / 'pitch.pt', map_location='cpu') + self.energies = torch.load(in_dir / 'energy.pt', map_location='cpu') + self.start_silences = torch.load(in_dir / 'start_silence.pt', map_location='cpu') + self.end_silences = torch.load(in_dir / 'end_silence.pt', map_location='cpu') + + self.utterances = {} + i = 0 + with open(in_dir / 'utterances', 'r') as f: + for line in f: + self.utterances[line.strip()] = i + i += 1 + self.idx2utt = {idx: utt for utt, idx in self.utterances.items()} diff --git a/anonymization/modules/prosody/prosody_anonymization.py b/anonymization/modules/prosody/prosody_anonymization.py new file mode 100644 index 0000000..7c1aaa4 --- /dev/null +++ b/anonymization/modules/prosody/prosody_anonymization.py @@ -0,0 +1,49 @@ +from pathlib import Path + +from .anonymization import * +from .prosody import Prosody + + +class ProsodyAnonymization: + + def __init__(self, settings, results_dir=None, save_intermediate=True, force_compute=False): + self.save_intermediate = save_intermediate + self.force_compute = force_compute if force_compute else settings.get('force_compute_anonymization', False) + anonymizer_type = settings.get('anonymizer_type', 'ims') + + if results_dir: + self.results_dir = results_dir + elif 'anon_results_path' in settings: + self.results_dir = settings['anon_results_path'] + elif 'results_dir' in settings: + self.results_dir = settings['results_dir'] + else: + if self.save_intermediate: + raise ValueError('Results dir must be specified in parameters or settings!') + + if anonymizer_type == 'ims': + random_offset_lower = settings.get('random_offset_lower', None) + random_offset_higher = settings.get('random_offset_higher', None) + self.anonymization = ImsProsodyAnonymization(random_offset_lower=random_offset_lower, + random_offset_higher=random_offset_higher) + + def anonymize_prosody(self, prosody, dataset_name): + dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') + + anon_prosody = Prosody() + + if (dataset_results_dir / 'utterances').exists() and not self.force_compute: + anon_prosody.load_prosody(dataset_results_dir) + unprocessed_utts = prosody.utterances.keys() - anon_prosody.utterances.keys() + else: + unprocessed_utts = prosody.utterances.keys() + + for utt in unprocessed_utts: + prosodic_elements = prosody.get_instance(utt) + anon_prosodic_elements = self.anonymization.anonymize_values(**prosodic_elements) + anon_prosody.add_instance(utt, **anon_prosodic_elements) + + if unprocessed_utts and self.save_intermediate: + anon_prosody.save_prosody(out_dir=dataset_results_dir) + + return anon_prosody \ No newline at end of file diff --git 
a/anonymization/modules/prosody/prosody_extraction.py b/anonymization/modules/prosody/prosody_extraction.py new file mode 100644 index 0000000..478f574 --- /dev/null +++ b/anonymization/modules/prosody/prosody_extraction.py @@ -0,0 +1,76 @@ +import torch +torch.set_num_threads(1) + +from tqdm import tqdm +from pathlib import Path + +from .prosody import Prosody +from .extraction import * +from utils import read_kaldi_format + + +class ProsodyExtraction: + + def __init__(self, device, settings, results_dir=None, save_intermediate=True, force_compute=False): + self.device = device + self.save_intermediate = save_intermediate + self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) + extractor_type = settings.get('extractor_type', 'ims') + + if results_dir: + self.results_dir = results_dir + elif 'extraction_results_path' in settings: + self.results_dir = settings['extraction_results_path'] + elif 'results_dir' in settings: + self.results_dir = settings['results_dir'] + else: + if self.save_intermediate: + raise ValueError('Results dir must be specified in parameters or settings!') + + if extractor_type == 'ims': + self.aligner_path = settings.get('aligner_model_path') + self.on_line_fine_tune = settings.get('on_line_fine_tune', True) + self.extractor = ImsProsodyExtractor(aligner_path=self.aligner_path, device=self.device, + on_line_fine_tune=self.on_line_fine_tune) + + def extract_prosody(self, dataset_path: Path, texts, dataset_name=None): + dataset_name = dataset_name if dataset_name else dataset_path.name + dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') + wav_scp = read_kaldi_format(dataset_path / 'wav.scp') + + data_prosody = Prosody() + text_is_phones = texts.is_phones + + if (dataset_results_dir / 'utterances').exists() and not self.force_compute: + data_prosody.load_prosody(dataset_results_dir) + unprocessed_utts = wav_scp.keys() - data_prosody.utterances.keys() + wav_scp = {utt: wav_scp[utt] for utt in unprocessed_utts} + + if wav_scp: + print(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances') + data_prosody.new = True + i = 0 + for utt, wav_path in tqdm(wav_scp.items()): + text = texts[utt] + try: + utt_prosody = self.extractor.extract_prosody(transcript=text, ref_audio_path=wav_path, + input_is_phones=text_is_phones) + except IndexError: + print(f'Index Error for {utt}') + continue + duration, pitch, energy, start_silence, end_silence = utt_prosody + data_prosody.add_instance(utterance=utt, duration=duration, pitch=pitch, energy=energy, + start_silence=start_silence, end_silence=end_silence) + i += 1 + if self.save_intermediate and i > 0 and i % 100 == 0: + data_prosody.save_prosody(dataset_results_dir) + + if self.save_intermediate: + data_prosody.save_prosody(dataset_results_dir) + + elif len(data_prosody.utterances) > 0: + print('No prosody extraction necessary; load stored values instead...') + else: + print(f'No utterances could be found in {dataset_path}!') + + return data_prosody diff --git a/anonymization/modules/speaker_embeddings/__init__.py b/anonymization/modules/speaker_embeddings/__init__.py new file mode 100644 index 0000000..9ce3486 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/__init__.py @@ -0,0 +1,3 @@ +from .speaker_extraction import SpeakerExtraction +from .speaker_anonymization import SpeakerAnonymization +from .speaker_embeddings import SpeakerEmbeddings \ No newline at end of file diff --git 
a/anonymization/modules/speaker_embeddings/anonymization/__init__.py b/anonymization/modules/speaker_embeddings/anonymization/__init__.py new file mode 100644 index 0000000..06d06ed --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/__init__.py @@ -0,0 +1,3 @@ +from .pool_anon import PoolAnonymizer +from .random_anon import RandomAnonymizer +from .gan_anon import GANAnonymizer \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/base_anon.py b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py new file mode 100644 index 0000000..e9a804a --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/base_anon.py @@ -0,0 +1,22 @@ +from pathlib import Path +import torch + + +class BaseAnonymizer: + + def __init__(self, vec_type='xvector', device=None, **kwargs): + # Base class for speaker embedding anonymization. + self.vec_type = vec_type + + if isinstance(device, torch.device): + self.device = device + elif isinstance(device, str): + self.device = torch.device(device) + elif isinstance(device, int): + self.device = torch.device(f'cuda:{device}') + else: + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + # Template method for anonymizing a dataset. Not implemented. + raise NotImplementedError('anonymize_data') diff --git a/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py new file mode 100644 index 0000000..cc2b7fd --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/gan_anon.py @@ -0,0 +1,83 @@ +import torch +import numpy as np +from scipy.spatial.distance import cosine +from tqdm import tqdm + +from .base_anon import BaseAnonymizer +from ..speaker_embeddings import SpeakerEmbeddings +from .utils.WGAN import EmbeddingsGenerator + + +class GANAnonymizer(BaseAnonymizer): + + def __init__(self, vec_type='xvector', device=None, model_name=None, vectors_file=None, sim_threshold=0.7, + gan_model_path=None, num_sampled=1000, save_intermediate=False, **kwargs): + super().__init__(vec_type=vec_type, device=device) + + self.model_name = model_name if model_name else f'gan_{vec_type}' + self.vectors_file = vectors_file + self.unused_indices_file = self.vectors_file.with_name(f'unused_indices_{self.vectors_file.name}') + self.sim_threshold = sim_threshold + self.save_intermediate = save_intermediate + self.n = num_sampled + + if self.vectors_file.is_file(): + self.gan_vectors = torch.load(self.vectors_file, map_location=self.device) + if self.unused_indices_file.is_file(): + self.unused_indices = torch.load(self.unused_indices_file, map_location='cpu') + else: + self.unused_indices = np.arange(len(self.gan_vectors)) + else: + self.gan_vectors, self.unused_indices = self._generate_artificial_embeddings(gan_model_path, self.n) + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + if emb_level == 'spk': + print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') + elif emb_level == 'utt': + print(f'Anonymize embeddings of {len(speaker_embeddings)} utterances...') + + identifiers = [] + speakers = [] + anon_vectors = [] + genders = [] + for i in tqdm(range(len(speaker_embeddings))): + identifier, vector = speaker_embeddings[i] + speaker = speaker_embeddings.original_speakers[i] + gender = speaker_embeddings.genders[i] + anon_vec = 
self._select_gan_vector(spk_vec=vector) + identifiers.append(identifier) + speakers.append(speaker) + anon_vectors.append(anon_vec) + genders.append(gender) + + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, genders=genders) + if self.save_intermediate: + torch.save(self.unused_indices, self.unused_indices_file) + + return anon_embeddings + + def _generate_artificial_embeddings(self, gan_model_path, n): + print(f'Generate {n} artificial speaker embeddings...') + generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device) + gan_vectors = generator.generate_embeddings(n=n) + unused_indices = np.arange(len(gan_vectors)) + + if self.save_intermediate: + torch.save(gan_vectors, self.vectors_file) + torch.save(unused_indices, self.unused_indices_file) + return gan_vectors, unused_indices + + def _select_gan_vector(self, spk_vec): + i = 0 + limit = 20 + while i < limit: + idx = np.random.choice(self.unused_indices) + anon_vec = self.gan_vectors[idx] + sim = 1 - cosine(spk_vec.cpu().numpy(), anon_vec.cpu().numpy()) + if sim < self.sim_threshold: + break + i += 1 + self.unused_indices = self.unused_indices[self.unused_indices != idx] + return anon_vec diff --git a/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py new file mode 100644 index 0000000..bcfdb7a --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/pool_anon.py @@ -0,0 +1,154 @@ +from pathlib import Path +import numpy as np +import torch +import json +from tqdm import tqdm +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import minmax_scale, StandardScaler + +from .base_anon import BaseAnonymizer +from .utils.plda_model import PLDAModel +from ..speaker_extraction import SpeakerExtraction +from ..speaker_embeddings import SpeakerEmbeddings +from utils import transform_path + +REVERSED_GENDERS = {'m': 'f', 'f': 'm'} + + +class PoolAnonymizer(BaseAnonymizer): + + def __init__(self, vec_type='xvector', device=None, model_name=None, pool_data_dir='data/libritts_train_other_500', + pool_vec_path='original_speaker_embeddings/pool_embeddings', N=200, N_star=100, distance='plda', + cross_gender=False, proximity='farthest', scaling=None, stats_per_dim_path=None, + distance_model_path='distances/plda/libritts_train_other_500_xvector', + embed_model_dir=None, save_intermediate=False, **kwargs): + # Pool anonymization method based on the primary baseline of the Voice Privacy Challenge 2020. + # Given a speaker vector, the N most distant vectors in an external speaker pool are extracted, + # and an average of a random subset of N_star vectors is computed and taken as new speaker vector. + # Default distance measure is PLDA. 
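+        # With the defaults (N=200, N_star=100, distance='plda', proximity='farthest'), this reproduces the
+        # selection strategy of the VoicePrivacy 2020 primary baseline, as also noted at the bottom of this file.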
+ super().__init__(vec_type=vec_type, device=device) + + self.model_name = model_name if model_name else f'pool_{vec_type}' + + self.N = N # number of most distant vectors to consider + self.N_star = N_star # number of vectors to include in averaged vector + self.proximity = proximity # proximity method, either 'farthest' (distant vectors), 'nearest', or 'closest' + self.cross_gender = cross_gender # Whether to reverse the genders of the speakers + self.save_intermediate = save_intermediate + + # external speaker pool + self.pool_embeddings = self._load_pool_embeddings(pool_data_dir=Path(pool_data_dir), + pool_vec_path=Path(pool_vec_path), + embed_model_dir=Path(embed_model_dir)) + self.pool_genders = {gender: [i for i, spk_gender in enumerate(self.pool_embeddings.genders) + if spk_gender == gender] for gender in set(self.pool_embeddings.genders)} + + # distance model; PLDA model if distance == plda; None if distance == cosine + self.distance = distance # distance measure, either 'plda' or 'cosine' + if self.distance == 'plda': + self.distance_model = PLDAModel(train_embeddings=self.pool_embeddings, + results_path=Path(distance_model_path), save_plda=self.save_intermediate) + else: + self.distance_model = None + + # scaling to ensure correct value ranges per dimension + self.scaling = scaling + self.stats_per_dim_path = stats_per_dim_path or Path() + + def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_dir): + print(pool_data_dir) + if pool_vec_path.exists(): + pool_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + pool_embeddings.load_vectors(pool_vec_path) + else: + extraction_settings = {'vec_type': self.vec_type, 'emb_level': 'spk'} + emb_extractor = SpeakerExtraction(results_dir=pool_vec_path, model_dir=embed_model_dir, device=self.device, + settings=extraction_settings, save_intermediate=self.save_intermediate) + pool_embeddings = emb_extractor.extract_speakers(dataset_path=pool_data_dir, dataset_name='') + return pool_embeddings + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + distance_matrix = self._compute_distances(vectors_a=self.pool_embeddings.vectors, + vectors_b=speaker_embeddings.vectors) + + print(f'Anonymize embeddings of {len(speaker_embeddings)} speakers...') + identifiers = [] + speakers = [] + anon_vectors = [] + genders = [] + + for i in tqdm(range(len(speaker_embeddings))): + identifier, _ = speaker_embeddings[i] + speaker = speaker_embeddings.original_speakers[i] + gender = speaker_embeddings.genders[i] + distances_to_speaker = distance_matrix[:, i] + candidates = self._get_pool_candidates(distances_to_speaker, gender) + selected_anon_pool = np.random.choice(candidates, self.N_star, replace=False) + anon_vec = torch.mean(self.pool_embeddings.speaker_vectors[selected_anon_pool], dim=0) + identifiers.append(identifier) + speakers.append(speaker) + anon_vectors.append(anon_vec) + genders.append(gender if not self.cross_gender else REVERSED_GENDERS[gender]) + + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device, vec_level=emb_level) + anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), + speakers=speakers, genders=genders) + + return anon_embeddings + + def _compute_distances(self, vectors_a, vectors_b): + if self.distance == 'plda': + return 1 - self.distance_model.compute_distance(enrollment_vectors=vectors_a, trial_vectors=vectors_b) + elif self.distance == 'cosine': + return 
cosine_distances(X=vectors_a.cpu(), Y=vectors_b.cpu()) + else: + return [] + + def _get_pool_candidates(self, distances, gender): + if self.cross_gender is True: + distances = distances[self.pool_genders[REVERSED_GENDERS[gender]]] + else: + distances = distances[self.pool_genders[gender]] + + if self.proximity == 'farthest': + return np.argpartition(distances, -self.N)[-self.N:] + elif self.proximity == 'nearest': + return np.argpartition(distances, self.N)[:self.N] + elif self.proximity == 'center': + sorted_distances = np.sort(distances) + return sorted_distances[len(sorted_distances)//2:(len(sorted_distances)//2)+self.N] + + def _load_scaling_ranges(self, stats_per_dim_path): + if stats_per_dim_path and Path(stats_per_dim_path).exists(): + with open(stats_per_dim_path) as f: + dim_ranges = json.load(f) + return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + else: + raise FileNotFoundError(f'You need to specify a path to an existing file containing the statistics for ' + f'each dimension in the given embedding type, ' + f'stats_per_dim_path={stats_per_dim_path} is not valid!') + + def _scale_embeddings(self, embeddings): + vectors = embeddings.vectors.cpu().numpy() + + if self.scaling == 'minmax': + scaling_ranges = self._load_scaling_ranges(self.stats_per_dim_path) + scaled_dims = [] + for i in range(len(scaling_ranges)): + scaled_dims.append(minmax_scale(vectors[:, i], scaling_ranges[i], axis=0)) + + scaled_vectors = torch.tensor(np.array(scaled_dims)).T.to(self.device) + embeddings.vectors = scaled_vectors + + elif self.scaling == 'std': + std_scaler = StandardScaler() + std_scaler.fit(self.pool_embeddings.vectors.cpu().numpy()) + scaled_vectors = torch.tensor(std_scaler.transform(vectors)) + embeddings.vectors = scaled_vectors + + return embeddings + +# for every source x-vector, an anonymized x-vector is computed by finding the N farthest x- +# vectors in an external pool (LibriTTS train-other-500) accord- +# ing to the PLDA distance, and by averaging N ∗ randomly se- +# lected vectors among them. 
In the baseline, we use N = 200 and N ∗ = 100 \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/random_anon.py b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py new file mode 100644 index 0000000..6a0c059 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/random_anon.py @@ -0,0 +1,68 @@ +import json +from pathlib import Path +import torch +import numpy as np + +from .base_anon import BaseAnonymizer +from ..speaker_embeddings import SpeakerEmbeddings + + +class RandomAnonymizer(BaseAnonymizer): + + def __init__(self, vec_type='xvector', device=None, model_name=None, in_scale=False, stats_per_dim_path=None, + **kwargs): + super().__init__(vec_type=vec_type, device=device) + + self.model_name = model_name if model_name else f'random_{vec_type}' + + if in_scale: + self.scaling_ranges = self._load_scaling_ranges(stats_per_dim_path=stats_per_dim_path) + else: + self.scaling_ranges = None + + def anonymize_embeddings(self, speaker_embeddings, emb_level='spk'): + if self.scaling_ranges: + print('Anonymize vectors in scale!') + return self._anonymize_data_in_scale(speaker_embeddings) + else: + identifiers = [] + anon_vectors = [] + speakers = speaker_embeddings.original_speakers + genders = speaker_embeddings.genders + for identifier, vector in speaker_embeddings: + mask = torch.zeros(vector.shape[0]).float().random_(-40, 40).to(self.device) + anon_vec = vector * mask + identifiers.append(identifier) + anon_vectors.append(anon_vec) + + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) + anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), + genders=genders, speakers=speakers) + + return anon_embeddings + + def _load_scaling_ranges(self, stats_per_dim_path): + if stats_per_dim_path is None: + stats_per_dim_path = Path('stats_per_dim.json') + + with open(stats_per_dim_path) as f: + dim_ranges = json.load(f) + return [(v['min'], v['max']) for k, v in sorted(dim_ranges.items(), key=lambda x: int(x[0]))] + + def _anonymize_data_in_scale(self, speaker_embeddings): + identifiers = [] + anon_vectors = [] + speakers = speaker_embeddings.original_speakers + genders = speaker_embeddings.genders + + for identifier, vector in speaker_embeddings: + anon_vec = torch.tensor([np.random.uniform(*dim_range) + for dim_range in self.scaling_ranges]).to(self.device) + identifiers.append(identifier) + anon_vectors.append(anon_vec) + + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, device=self.device) + anon_embeddings.set_vectors(identifiers=identifiers, vectors=torch.stack(anon_vectors, dim=0), genders=genders, + speakers=speakers) + + return anon_embeddings \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/__init__.py b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/__init__.py new file mode 100644 index 0000000..d14613c --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/__init__.py @@ -0,0 +1 @@ +from .embeddings_generator import EmbeddingsGenerator \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/embeddings_generator.py b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/embeddings_generator.py new file mode 100644 index 0000000..2d1ed7d --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/embeddings_generator.py @@ -0,0 +1,34 @@ 
+import torch + +from .init_wgan import create_wgan + + +class EmbeddingsGenerator: + + def __init__(self, gan_path, device): + self.device = device + self.gan_path = gan_path + + self.mean = None + self.std = None + self.wgan = None + + self._load_model(self.gan_path) + + def generate_embeddings(self, n=1000): + generated_samples = self.wgan.sample_generator(num_samples=n, nograd=True, return_intermediate=False).cpu() + return self._inverse_normalize(generated_samples) + + + def _load_model(self, path): + gan_checkpoint = torch.load(path, map_location="cpu") + + self.wgan = create_wgan(parameters=gan_checkpoint['model_parameters'], device=self.device) + self.wgan.G.load_state_dict(gan_checkpoint['generator_state_dict']) + self.wgan.D.load_state_dict(gan_checkpoint['critic_state_dict']) + + self.mean = gan_checkpoint['mean'] + self.std = gan_checkpoint['std'] + + def _inverse_normalize(self, tensor): + return tensor * self.std + self.mean diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/init_wgan.py b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/init_wgan.py new file mode 100644 index 0000000..25bc81b --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/init_wgan.py @@ -0,0 +1,65 @@ +import torch +import torch.nn as nn + +from .wgan_qc import WassersteinGanQuadraticCost +from .resnet_1 import ResNet_D, ResNet_G + + +def create_wgan(parameters, device, optimizer='adam'): + if parameters['model'] == 'resnet': + generator, discriminator = init_resnet(parameters) + else: + raise NotImplementedError + + if optimizer == 'adam': + optimizer_g = torch.optim.Adam(generator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) + optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) + elif optimizer == 'rmsprop': + optimizer_g = torch.optim.RMSprop(generator.parameters(), lr=parameters['learning_rate']) + optimizer_d = torch.optim.RMSprop(generator.parameters(), lr=parameters['learning_rate']) + + criterion = torch.nn.MSELoss() + + gan = WassersteinGanQuadraticCost(generator, + discriminator, + optimizer_g, + optimizer_d, + criterion=criterion, + data_dimensions=parameters['data_dim'], + epochs=parameters['epochs'], + batch_size=parameters['batch_size'], + device=device, + n_max_iterations=parameters['n_max_iterations'], + gamma=parameters['gamma']) + + return gan + + +def init_resnet(parameters): + critic = ResNet_D(parameters['data_dim'][-1], parameters['size'], nfilter=parameters['nfilter'], + nfilter_max=parameters['nfilter_max']) + generator = ResNet_G(parameters['data_dim'][-1], parameters['z_dim'], parameters['size'], + nfilter=parameters['nfilter'], nfilter_max=parameters['nfilter_max']) + + generator.apply(weights_init_G) + critic.apply(weights_init_D) + + return generator, critic + + +def weights_init_D(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') + elif classname.find('BatchNorm') != -1: + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + +def weights_init_G(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') + elif classname.find('BatchNorm') != -1: + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/resnet_1.py 
b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/resnet_1.py new file mode 100644 index 0000000..ef5eff8 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/resnet_1.py @@ -0,0 +1,175 @@ +import numpy as np +import torch +import torch.utils.data +import torch.utils.data.distributed +from torch import nn + + +class ResNet_G(nn.Module): + + def __init__(self, data_dim, z_dim, size, nfilter=64, nfilter_max=512, bn=True, res_ratio=0.1, **kwargs): + super().__init__() + self.input_dim = z_dim + self.output_dim = z_dim + self.dropout_rate = 0 + + s0 = self.s0 = 4 + nf = self.nf = nfilter + nf_max = self.nf_max = nfilter_max + self.bn = bn + self.z_dim = z_dim + + # Submodules + nlayers = int(np.log2(size / s0)) + self.nf0 = min(nf_max, nf * 2 ** (nlayers + 1)) + + self.fc = nn.Linear(z_dim, self.nf0 * s0 * s0) + if self.bn: + self.bn1d = nn.BatchNorm1d(self.nf0 * s0 * s0) + self.relu = nn.LeakyReLU(0.2, inplace=True) + + blocks = [] + for i in range(nlayers, 0, -1): + nf0 = min(nf * 2 ** (i + 1), nf_max) + nf1 = min(nf * 2 ** i, nf_max) + blocks += [ + ResNetBlock(nf0, nf1, bn=self.bn, res_ratio=res_ratio), + nn.Upsample(scale_factor=2) + ] + + nf0 = min(nf * 2, nf_max) + nf1 = min(nf, nf_max) + blocks += [ + ResNetBlock(nf0, nf1, bn=self.bn, res_ratio=res_ratio), + ResNetBlock(nf1, nf1, bn=self.bn, res_ratio=res_ratio) + ] + + self.resnet = nn.Sequential(*blocks) + self.conv_img = nn.Conv2d(nf, 3, 3, padding=1) + + self.fc_out = nn.Linear(3 * size * size, data_dim) + + def forward(self, z, return_intermediate=False): + batch_size = z.size(0) + out = self.fc(z) + if self.bn: + out = self.bn1d(out) + out = self.relu(out) + if return_intermediate: + l_1 = out.detach().clone() + out = out.view(batch_size, self.nf0, self.s0, self.s0) + + out = self.resnet(out) + + out = self.conv_img(out) + out = self.relu(out) + out.flatten(1) + out = self.fc_out(out.flatten(1)) + + if return_intermediate: + return out, l_1 + return out + + def sample_latent(self, n_samples, z_size): + return torch.randn((n_samples, z_size)) + + +class ResNet_D(nn.Module): + + def __init__(self, data_dim, size, nfilter=64, nfilter_max=512, res_ratio=0.1): + super().__init__() + s0 = self.s0 = 4 + nf = self.nf = nfilter + nf_max = self.nf_max = nfilter_max + self.size = size + + # Submodules + nlayers = int(np.log2(size / s0)) + self.nf0 = min(nf_max, nf * 2 ** nlayers) + + nf0 = min(nf, nf_max) + nf1 = min(nf * 2, nf_max) + blocks = [ + ResNetBlock(nf0, nf0, bn=False, res_ratio=res_ratio), + ResNetBlock(nf0, nf1, bn=False, res_ratio=res_ratio) + ] + + self.fc_input = nn.Linear(data_dim, 3 * size * size) + + for i in range(1, nlayers + 1): + nf0 = min(nf * 2 ** i, nf_max) + nf1 = min(nf * 2 ** (i + 1), nf_max) + blocks += [ + nn.AvgPool2d(3, stride=2, padding=1), + ResNetBlock(nf0, nf1, bn=False, res_ratio=res_ratio), + ] + + self.conv_img = nn.Conv2d(3, 1 * nf, 3, padding=1) + self.relu = nn.LeakyReLU(0.2, inplace=True) + self.resnet = nn.Sequential(*blocks) + + self.fc = nn.Linear(self.nf0 * s0 * s0, 1) + + def forward(self, x): + batch_size = x.size(0) + + out = self.fc_input(x) + out = self.relu(out).view(batch_size, 3, self.size, self.size) + + out = self.relu((self.conv_img(out))) + out = self.resnet(out) + out = out.view(batch_size, self.nf0 * self.s0 * self.s0) + out = self.fc(out) + + return out + + +class ResNetBlock(nn.Module): + + def __init__(self, fin, fout, fhidden=None, bn=True, res_ratio=0.1): + super().__init__() + # Attributes + self.bn = bn + self.is_bias = not 
bn + self.learned_shortcut = (fin != fout) + self.fin = fin + self.fout = fout + if fhidden is None: + self.fhidden = min(fin, fout) + else: + self.fhidden = fhidden + self.res_ratio = res_ratio + + # Submodules + self.conv_0 = nn.Conv2d(self.fin, self.fhidden, 3, stride=1, padding=1, bias=self.is_bias) + if self.bn: + self.bn2d_0 = nn.BatchNorm2d(self.fhidden) + self.conv_1 = nn.Conv2d(self.fhidden, self.fout, 3, stride=1, padding=1, bias=self.is_bias) + if self.bn: + self.bn2d_1 = nn.BatchNorm2d(self.fout) + if self.learned_shortcut: + self.conv_s = nn.Conv2d(self.fin, self.fout, 1, stride=1, padding=0, bias=False) + if self.bn: + self.bn2d_s = nn.BatchNorm2d(self.fout) + self.relu = nn.LeakyReLU(0.2, inplace=True) + + def forward(self, x): + x_s = self._shortcut(x) + dx = self.conv_0(x) + if self.bn: + dx = self.bn2d_0(dx) + dx = self.relu(dx) + dx = self.conv_1(dx) + if self.bn: + dx = self.bn2d_1(dx) + out = self.relu(x_s + self.res_ratio * dx) + return out + + def _shortcut(self, x): + if self.learned_shortcut: + x_s = self.conv_s(x) + if self.bn: + x_s = self.bn2d_s(x_s) + else: + x_s = x + return x_s diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/wgan_qc.py b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/wgan_qc.py new file mode 100644 index 0000000..ae4ce33 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/WGAN/wgan_qc.py @@ -0,0 +1,276 @@ +import os +import time + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from cvxopt import matrix +from cvxopt import solvers +from cvxopt import sparse +from cvxopt import spmatrix +from torch.autograd import grad as torch_grad +from tqdm import tqdm + + +class WassersteinGanQuadraticCost: + + def __init__(self, generator, discriminator, gen_optimizer, dis_optimizer, criterion, epochs, n_max_iterations, + data_dimensions, batch_size, device, gamma=0.1, K=-1, milestones=[150000, 250000], lr_anneal=1.0, + device_ids=None): + self.G = generator + self.G_opt = gen_optimizer + self.D = discriminator + self.D_opt = dis_optimizer + self.losses = { + 'D' : [], + 'WD': [], + 'G' : [] + } + self.num_steps = 0 + self.gen_steps = 0 + self.epochs = epochs + self.n_max_iterations = n_max_iterations + # put in the shape of a dataset sample + self.data_dim = data_dimensions[0] * data_dimensions[1] * data_dimensions[2] + self.batch_size = batch_size + self.device = device + self.criterion = criterion + self.mone = torch.FloatTensor([-1]).to(device) + self.tensorboard_counter = 0 + + if K <= 0: + self.K = 1 / self.data_dim + else: + self.K = K + self.Kr = np.sqrt(self.K) + self.LAMBDA = 2 * self.Kr * gamma * 2 + + if device_ids is None: + device_ids = [self.device.index] + + self.G = nn.DataParallel(self.G.to(self.device), device_ids=device_ids) + self.D = nn.DataParallel(self.D.to(self.device), device_ids=device_ids) + + self.schedulerD = self._build_lr_scheduler_(self.D_opt, milestones, lr_anneal) + self.schedulerG = self._build_lr_scheduler_(self.G_opt, milestones, lr_anneal) + + self.c, self.A, self.pStart = self._prepare_linear_programming_solver_(self.batch_size) + + def _build_lr_scheduler_(self, optimizer, milestones, lr_anneal, last_epoch=-1): + scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=lr_anneal, last_epoch=-1) + return scheduler + + def _quadratic_wasserstein_distance_(self, real, generated): + num_r = real.size(0) + num_f = generated.size(0) + real_flat = real.view(num_r, -1) + fake_flat = 
generated.view(num_f, -1) + + real3D = real_flat.unsqueeze(1).expand(num_r, num_f, self.data_dim) + fake3D = fake_flat.unsqueeze(0).expand(num_r, num_f, self.data_dim) + # compute squared L2 distance + dif = real3D - fake3D + dist = 0.5 * dif.pow(2).sum(2).squeeze() + + return self.K * dist + + def _prepare_linear_programming_solver_(self, batch_size): + A = spmatrix(1.0, range(batch_size), [0] * batch_size, (batch_size, batch_size)) + for i in range(1, batch_size): + Ai = spmatrix(1.0, range(batch_size), [i] * batch_size, (batch_size, batch_size)) + A = sparse([A, Ai]) + + D = spmatrix(-1.0, range(batch_size), range(batch_size), (batch_size, batch_size)) + DM = D + for i in range(1, batch_size): + DM = sparse([DM, D]) + + A = sparse([[A], [DM]]) + + cr = matrix([-1.0 / batch_size] * batch_size) + cf = matrix([1.0 / batch_size] * batch_size) + c = matrix([cr, cf]) + + pStart = {} + pStart['x'] = matrix([matrix([1.0] * batch_size), matrix([-1.0] * batch_size)]) + pStart['s'] = matrix([1.0] * (2 * batch_size)) + + return c, A, pStart + + def _linear_programming_(self, distance, batch_size): + b = matrix(distance.cpu().double().detach().numpy().flatten()) + sol = solvers.lp(self.c, self.A, b, primalstart=self.pStart, solver='glpk', + options={'glpk': {'msg_lev': 'GLP_MSG_OFF'}}) + offset = 0.5 * (sum(sol['x'])) / batch_size + sol['x'] = sol['x'] - offset + self.pStart['x'] = sol['x'] + self.pStart['s'] = sol['s'] + + return sol + + def _approx_OT_(self, sol): + # Compute the OT mapping for each fake dataset + ResMat = np.array(sol['z']).reshape((self.batch_size, self.batch_size)) + mapping = torch.from_numpy(np.argmax(ResMat, axis=0)).long().to(self.device) + + return mapping + + def _optimal_transport_regularization_(self, output_fake, fake, real_fake_diff): + output_fake_grad = torch.ones(output_fake.size()).to(self.device) + gradients = torch_grad(outputs=output_fake, inputs=fake, + grad_outputs=output_fake_grad, + create_graph=True, retain_graph=True, only_inputs=True)[0] + n = gradients.size(0) + RegLoss = 0.5 * ((gradients.view(n, -1).norm(dim=1) / (2 * self.Kr) - self.Kr / 2 * real_fake_diff.view(n, + -1).norm( + dim=1)).pow(2)).mean() + fake.requires_grad = False + + return RegLoss + + def _critic_deep_regression_(self, images, opt_iterations=1): + images = images.to(self.device) + + for p in self.D.parameters(): # reset requires_grad + p.requires_grad = True # they are set to False below in netG update + + self.G.train() + self.D.train() + + # Get generated fake dataset + generated_data = self.sample_generator(self.batch_size) + + # compute wasserstein distance + distance = self._quadratic_wasserstein_distance_(images, generated_data) + # solve linear programming problem + sol = self._linear_programming_(distance, self.batch_size) + # approximate optimal transport + mapping = self._approx_OT_(sol) + real_ordered = images[mapping] # match real and fake + real_fake_diff = real_ordered - generated_data + + # construct target + target = torch.from_numpy(np.array(sol['x'])).float() + target = target.squeeze().to(self.device) + + for i in range(opt_iterations): + self.D.zero_grad() # ??? 
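+            # note: self.D.zero_grad() above and self.D_opt.zero_grad() below clear the same parameter
+            # gradients (via the module and via its optimizer); doing both is redundant but harmless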
+ self.D_opt.zero_grad() + generated_data.requires_grad_() + if generated_data.grad is not None: + generated_data.grad.data.zero_() + output_real = self.D(images) + output_fake = self.D(generated_data) + output_real, output_fake = output_real.squeeze(), output_fake.squeeze() + output_R_mean = output_real.mean(0).view(1) + output_F_mean = output_fake.mean(0).view(1) + + L2LossD_real = self.criterion(output_R_mean[0], target[:self.batch_size].mean()) + L2LossD_fake = self.criterion(output_fake, target[self.batch_size:]) + L2LossD = 0.5 * L2LossD_real + 0.5 * L2LossD_fake + + reg_loss_D = self._optimal_transport_regularization_(output_fake, generated_data, real_fake_diff) + + total_loss = L2LossD + self.LAMBDA * reg_loss_D + + self.losses['D'].append(float(total_loss.data)) + + total_loss.backward() + self.D_opt.step() + + # this is supposed to be the wasserstein distance + wasserstein_distance = output_R_mean - output_F_mean + self.losses['WD'].append(float(wasserstein_distance.data)) + + def _generator_train_iteration(self, batch_size): + for p in self.D.parameters(): + p.requires_grad = False # freeze critic + + self.G.zero_grad() + self.G_opt.zero_grad() + + if isinstance(self.G, torch.nn.parallel.DataParallel): + z = self.G.module.sample_latent(batch_size, self.G.module.z_dim) + else: + z = self.G.sample_latent(batch_size, self.G.z_dim) + z.requires_grad = True + + fake = self.G(z) + output_fake = self.D(fake) + output_F_mean_after = output_fake.mean(0).view(1) + + self.losses['G'].append(float(output_F_mean_after.data)) + + output_F_mean_after.backward(self.mone) + self.G_opt.step() + + self.schedulerD.step() + self.schedulerG.step() + + def _train_epoch(self, data_loader, writer, experiment): + for i, data in enumerate(tqdm(data_loader)): + images = data[0] + speaker_ids = data[1] + self.num_steps += 1 + # self.tensorboard_counter += 1 + if self.gen_steps >= self.n_max_iterations: + return + self._critic_deep_regression_(images) + self._generator_train_iteration(images.size(0)) + + D_loss_avg = np.average(self.losses['D']) + G_loss_avg = np.average(self.losses['G']) + wd_avg = np.average(self.losses['WD']) + + def train(self, data_loader, writer, experiment=None): + self.G.train() + self.D.train() + + for epoch in range(self.epochs): + if self.gen_steps >= self.n_max_iterations: + return + time_start_epoch = time.time() + self._train_epoch(data_loader, writer, experiment) + + D_loss_avg = np.average(self.losses['D']) + + time_end_epoch = time.time() + + return self + + def sample_generator(self, num_samples, nograd=False, return_intermediate=False): + self.G.eval() + if isinstance(self.G, torch.nn.parallel.DataParallel): + latent_samples = self.G.module.sample_latent(num_samples, self.G.module.z_dim) + else: + latent_samples = self.G.sample_latent(num_samples, self.G.z_dim) + latent_samples = latent_samples.to(self.device) + if nograd: + with torch.no_grad(): + generated_data = self.G(latent_samples, return_intermediate=return_intermediate) + else: + generated_data = self.G(latent_samples) + self.G.train() + if return_intermediate: + return generated_data[0].detach(), generated_data[1], latent_samples + return generated_data.detach() + + def sample(self, num_samples): + generated_data = self.sample_generator(num_samples) + # Remove color channel + return generated_data.data.cpu().numpy()[:, 0, :, :] + + def save_model_checkpoint(self, model_path, model_parameters, timestampStr): + # dateTimeObj = datetime.now() + # timestampStr = dateTimeObj.strftime("%d-%m-%Y-%H-%M-%S") + name = 
'%s_%s' % (timestampStr, 'wgan') + model_filename = os.path.join(model_path, name) + torch.save({ + 'generator_state_dict' : self.G.state_dict(), + 'critic_state_dict' : self.D.state_dict(), + 'gen_optimizer_state_dict' : self.G_opt.state_dict(), + 'critic_optimizer_state_dict': self.D_opt.state_dict(), + 'model_parameters' : model_parameters, + 'iterations' : self.num_steps + }, model_filename) diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/__init__.py b/anonymization/modules/speaker_embeddings/anonymization/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py new file mode 100644 index 0000000..3abf1fa --- /dev/null +++ b/anonymization/modules/speaker_embeddings/anonymization/utils/plda_model.py @@ -0,0 +1,87 @@ +# This code is based on the descriptions in https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/PLDA_LDA.py +from pathlib import Path +from speechbrain.processing.PLDA_LDA import PLDA, StatObject_SB, Ndx, fast_PLDA_scoring +import numpy as np +import torch + +class PLDAModel: + + def __init__(self, train_embeddings, results_path: Path=None, save_plda=True): + self.mean, self.F, self.Sigma = None, None, None + + files_exist = False + if results_path and results_path.exists(): + files_exist = self.load_parameters(results_path) + if not files_exist: + self._train_plda(train_embeddings) + if results_path and save_plda: + self.save_parameters(results_path) + + def compute_distance(self, enrollment_vectors, enrollment_ids, trial_vectors, trial_ids, return_object=False): + enrol_vecs = enrollment_vectors.cpu().numpy() + en_sets, en_s, en_stat0 = self._get_vector_stats(enrol_vecs, sg_tag='en', utt_ids=enrollment_ids) + en_stat = StatObject_SB(modelset=en_sets, segset=en_sets, start=en_s, stop=en_s, stat0=en_stat0, + stat1=enrol_vecs) + + trial_vecs = trial_vectors.cpu().numpy() + te_sets, te_s, te_stat0 = self._get_vector_stats(trial_vecs, sg_tag='te', utt_ids=trial_ids) + te_stat = StatObject_SB(modelset=te_sets, segset=te_sets, start=te_s, stop=te_s, stat0=te_stat0, + stat1=trial_vecs) + + ndx = Ndx(models=en_sets, testsegs=te_sets) + scores_plda = fast_PLDA_scoring(en_stat, te_stat, ndx, self.mean, self.F, self.Sigma) + if return_object: + return scores_plda + else: + return scores_plda.scoremat + + def save_parameters(self, filename): + filename.mkdir(parents=True, exist_ok=True) + np.save(filename / 'plda_mean.npy', self.mean) + np.save(filename / 'plda_F.npy', self.F) + np.save(filename / 'plda_Sigma.npy', self.Sigma) + + def load_parameters(self, dir_path): + existing_files = [x.name for x in dir_path.glob('*')] + files_exist = True + if 'plda_mean.npy' in existing_files: + self.mean = np.load(dir_path / 'plda_mean.npy') + else: + files_exist = False + + if 'plda_F.npy' in existing_files: + self.F = np.load(dir_path / 'plda_F.npy') + else: + files_exist = False + + if 'plda_Sigma.npy' in existing_files: + self.Sigma = np.load(dir_path / 'plda_Sigma.npy') + else: + files_exist = False + return files_exist + + def _train_plda(self, train_embeddings): + vectors = train_embeddings.vectors.to(torch.float64) + + modelset = np.array([f'md{speaker}' for speaker in train_embeddings.original_speakers], dtype="|O") + print(len(modelset), len(set(modelset))) + segset, s, stat0 = self._get_vector_stats(vectors, sg_tag='sg', 
utt_ids=train_embeddings.get_utt_list()) + + xvectors_stat = StatObject_SB(modelset=modelset, segset=segset, start=s, stop=s, stat0=stat0, + stat1=vectors.cpu().numpy()) + + print(vectors.shape) + + plda = PLDA(rank_f=100) + plda.plda(xvectors_stat) + + self.mean = plda.mean + self.F = plda.F + self.Sigma = plda.Sigma + + def _get_vector_stats(self, vectors, utt_ids, sg_tag='sg'): + N, dim = vectors.shape + segset = np.array([f'{utt_id}' for utt_id in utt_ids], dtype="|O") + s = np.array([None] * N) + stat0 = np.array([[1.0]] * N) + return segset, s, stat0 \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/extraction/__init__.py b/anonymization/modules/speaker_embeddings/extraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/__init__.py b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/__init__.py new file mode 100644 index 0000000..8e04392 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/__init__.py @@ -0,0 +1,2 @@ +from .speechbrain_vectors import SpeechBrainVectors +from .style_embeddings import StyleEmbeddings \ No newline at end of file diff --git a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py new file mode 100644 index 0000000..83fd2fd --- /dev/null +++ b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/speechbrain_vectors.py @@ -0,0 +1,36 @@ +from pathlib import Path +import numpy as np +import torch +from speechbrain.pretrained import EncoderClassifier + + +class SpeechBrainVectors: + + VEC_PATHS = { + 'xvector': 'spkrec-xvect-voxceleb', + 'ecapa': 'spkrec-ecapa-voxceleb' + } + + def __init__(self, vec_type, device, model_path=None): + self.device = device + + if model_path is not None and model_path.exists(): + model_path = str(Path(model_path).absolute()) + self.extractor = EncoderClassifier.from_hparams(source=model_path, savedir=model_path, + run_opts={'device': self.device}) + else: + if model_path is None: + model_path = Path('') + vec_path = self.VEC_PATHS[vec_type] + # The following line downloads and loads the corresponding speaker embedding model from huggingface and store + # it in the corresponding savedir. If a model has been previously downloaded and stored already, + # it is loaded from savedir instead of downloading it again. 
+ self.extractor = EncoderClassifier.from_hparams(source=f'speechbrain/{vec_path}', + savedir=Path(model_path, vec_path), + run_opts={'device': self.device}) + + def extract_vector(self, audio, sr): + audio = torch.tensor(np.trim_zeros(audio.cpu().numpy())) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + return self.extractor.encode_batch(wavs=audio).squeeze() diff --git a/anonymization/modules/speaker_embeddings/extraction/embedding_methods/style_embeddings.py b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/style_embeddings.py new file mode 100644 index 0000000..e3285d9 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/extraction/embedding_methods/style_embeddings.py @@ -0,0 +1,31 @@ +import warnings +import torch +from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Spectrogram_to_Embedding.StyleEmbedding import StyleEmbedding +from anonymization.modules.tts.IMSToucan.Preprocessing.AudioPreprocessor import AudioPreprocessor + + +class StyleEmbeddings: + + def __init__(self, model_path, device): + self.device = device + + self.extractor = StyleEmbedding() + check_dict = torch.load(model_path, map_location='cpu') + self.extractor.load_state_dict(check_dict['style_emb_func']) + self.extractor.to(self.device) + + self.audio_preprocessor = AudioPreprocessor(input_sr=16000, output_sr=16000, cut_silence=True, + device=self.device) + + def extract_vector(self, audio, sr): + if sr != self.audio_preprocessor.sr: + self.audio_preprocessor = AudioPreprocessor(input_sr=sr, output_sr=16000, cut_silence=True, + device=self.device) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + audio = self.audio_preprocessor.cut_silence_from_audio(audio.to(self.device)).cpu() + spec = self.audio_preprocessor.logmelfilterbank(audio, 16000).transpose(0, 1) + spec_len = torch.LongTensor([len(spec)]) + vector = self.extractor(spec.unsqueeze(0).to(self.device), spec_len.unsqueeze(0).to(self.device)) + return vector.squeeze().detach() diff --git a/anonymization/modules/speaker_embeddings/extraction/ims_speaker_extraction_methods.py b/anonymization/modules/speaker_embeddings/extraction/ims_speaker_extraction_methods.py new file mode 100644 index 0000000..0264592 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/extraction/ims_speaker_extraction_methods.py @@ -0,0 +1,27 @@ +import numpy as np +import torch +import torchaudio +import pyloudnorm as pyln + + +def normalize_wave(wave, sr, device): + # adapted from IMSToucan/Preprocessing/AudioPreprocessor + dur = wave.shape[1] / sr + wave = wave.squeeze().cpu().numpy() + + # normalize loudness + try: + meter = pyln.Meter(sr, block_size=min(dur - 0.0001, abs(dur - 0.1)) if dur < 0.4 else 0.4) + loudness = meter.integrated_loudness(wave) + loud_normed = pyln.normalize.loudness(wave, loudness, -30.0) + peak = np.amax(np.abs(loud_normed)) + norm_wave = np.divide(loud_normed, peak) + except ZeroDivisionError: + norm_wave = wave + + wave = torch.Tensor(norm_wave).to(device) + + if sr != 16000: + wave = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000).to(device)(wave) + + return wave diff --git a/anonymization/modules/speaker_embeddings/speaker_anonymization.py b/anonymization/modules/speaker_embeddings/speaker_anonymization.py new file mode 100644 index 0000000..4374236 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/speaker_anonymization.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from .anonymization import PoolAnonymizer, RandomAnonymizer, GANAnonymizer +from 
.speaker_embeddings import SpeakerEmbeddings + + +class SpeakerAnonymization: + + def __init__(self, vectors_dir, device, settings, results_dir=None, save_intermediate=True, force_compute=False): + self.vectors_dir = vectors_dir + self.device = device + self.save_intermediate = save_intermediate + self.force_compute = force_compute if force_compute else settings.get('force_compute_anonymization', False) + + self.vec_type = settings['vec_type'] + self.emb_level = settings['emb_level'] + + if results_dir: + self.results_dir = results_dir + elif 'anon_results_path' in settings: + self.results_dir = settings['anon_results_path'] + elif 'results_dir' in settings: + self.results_dir = settings['results_dir'] + else: + if self.save_intermediate: + raise ValueError('Results dir must be specified in parameters or settings!') + + self.anonymizer = self._load_anonymizer(settings) + + def anonymize_embeddings(self, speaker_embeddings, dataset_name): + dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else '' + + if dataset_results_dir.exists() and any(dataset_results_dir.iterdir()) and not speaker_embeddings.new and not\ + self.force_compute: + # if there are already anonymized speaker embeddings from this model and the computation is not forced, + # simply load them + print('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings ' + 'instead...') + anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level=self.emb_level, device=self.device) + anon_embeddings.load_vectors(dataset_results_dir) + return anon_embeddings + else: + # otherwise, create new anonymized speaker embeddings + print('Anonymize speaker embeddings...') + anon_embeddings = self.anonymizer.anonymize_embeddings(speaker_embeddings, emb_level=self.emb_level) + + if self.save_intermediate: + anon_embeddings.save_vectors(dataset_results_dir) + return anon_embeddings + + def _load_anonymizer(self, settings): + anon_method = settings['anon_method'] + vec_type = settings.get('vec_type', 'xvector') + model_name = settings.get('anon_name', None) + + if anon_method == 'random': + anon_settings = settings.get('random_anon_settings', {}) + model = RandomAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, **anon_settings) + + elif anon_method == 'pool': + anon_settings = settings.get('pool_anon_settings', {}) + model = PoolAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, + embed_model_dir=settings.get('embed_model_path', Path()), + save_intermediate=self.save_intermediate, **anon_settings) + + elif anon_method == 'gan': + anon_settings = settings.get('gan_anon_settings', {}) + model = GANAnonymizer(vec_type=vec_type, device=self.device, model_name=model_name, + save_intermediate=self.save_intermediate, **anon_settings) + else: + raise ValueError(f'Unknown anonymization method {anon_method}') + + print(f'Model type of anonymizer: {model_name}') + return model diff --git a/anonymization/modules/speaker_embeddings/speaker_embeddings.py b/anonymization/modules/speaker_embeddings/speaker_embeddings.py new file mode 100644 index 0000000..5697a73 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/speaker_embeddings.py @@ -0,0 +1,144 @@ +from collections import defaultdict +from pathlib import Path +import torch + +from utils import read_kaldi_format, save_kaldi_format, create_clean_dir + + +class SpeakerEmbeddings: + + def __init__(self, vec_type='xvector', vec_level='spk', device=torch.device('cpu')): + self.vec_type = 
vec_type + self.vec_level = vec_level + self.device = device + + self.identifiers2idx = {} + self.idx2identifiers = {} + self.vectors = None + self.original_speakers = [] + self.genders = [] + + self.new = True + + def __iter__(self): + assert self.identifiers2idx and self.vectors is not None, \ + 'Speaker vectors need to be extracted or loaded before they can be iterated!' + + for identifier, idx in sorted(self.identifiers2idx.items(), key=lambda x: x[1]): + yield identifier, self.vectors[idx] + + def __len__(self): + return len(self.identifiers2idx) + + def __getitem__(self, item): + assert (self.identifiers2idx is not None) and (self.vectors is not None), \ + 'Speaker vectors need to be extracted or loaded before they can be accessed!' + assert item < len(self), 'Index needs to be smaller than the number of speakers!' + return self.idx2identifiers[item], self.vectors[item] + + def add_vector(self, identifier, vector, speaker, gender): + idx = len(self) + if self.vectors is None: # explicit None check; truth-testing a multi-element tensor raises an error + self.vectors = torch.tensor(vector) + else: + self.vectors = torch.cat((self.vectors, vector), 0) + self.identifiers2idx[identifier] = idx + self.idx2identifiers[idx] = identifier + self.original_speakers.append(speaker) + self.genders.append(gender) + + def set_vectors(self, identifiers, vectors, speakers, genders): + if not isinstance(identifiers, dict): + self.identifiers2idx = {identifier: idx for idx, identifier in enumerate(identifiers)} + else: + self.identifiers2idx = identifiers + self.vectors = torch.tensor(vectors) if not isinstance(vectors, torch.Tensor) else vectors + self.genders = genders + self.original_speakers = speakers + self.idx2identifiers = {idx: identifier for identifier, idx in self.identifiers2idx.items()} + + def add_vectors(self, identifiers, vectors, speakers, genders): + if not isinstance(identifiers, dict): + identifiers = {identifier: idx for idx, identifier in enumerate(identifiers)} + + new_identifiers = list(identifiers.keys() - self.identifiers2idx.keys()) + indices = [identifiers[iden] for iden in new_identifiers] + last_known_index = len(self) + + new_iden_dict = {iden: last_known_index + i for i, iden in enumerate(new_identifiers)} + self.identifiers2idx.update(new_iden_dict) + self.idx2identifiers.update({idx: iden for iden, idx in new_iden_dict.items()}) + if self.vectors is None: # explicit None check; truth-testing a multi-element tensor raises an error + self.vectors = torch.tensor(vectors[indices]) + else: + self.vectors = torch.cat((self.vectors, vectors[indices]), dim=0) + self.genders.extend([genders[idx] for idx in indices]) + self.original_speakers.extend([speakers[idx] for idx in indices]) + + def load_vectors(self, in_dir: Path): + assert (in_dir / f'id2idx').exists() and (in_dir / f'speaker_vectors.pt').exists(), \ + f'speaker_vectors.pt and id2idx must exist in {in_dir}!' + + idx2spk = read_kaldi_format(in_dir / 'idx2spk') + spk2gender = read_kaldi_format(in_dir / 'spk2gender') + self.original_speakers = [spk for idx, spk in sorted(idx2spk.items(), key=lambda x: x[0])] + self.genders = [spk2gender[spk] for spk in self.original_speakers] + self.vectors = torch.load(in_dir / f'speaker_vectors.pt', map_location=self.device) + + self.identifiers2idx = {id: int(idx) for id, idx in read_kaldi_format(in_dir / f'id2idx').items()} + self.idx2identifiers = {idx: identifier for identifier, idx in self.identifiers2idx.items()} + + self.new = False + + def save_vectors(self, out_dir: Path): + assert (self.identifiers2idx is not None) and (self.vectors is not None), \ + 'Speaker vectors need to be extracted or loaded before they can be stored!'
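# Minimal usage sketch of the SpeakerEmbeddings container defined in this file. The utterance
# identifiers, speakers, genders and vectors below are hypothetical placeholders; only the class
# as written here is assumed.
import torch

embeddings = SpeakerEmbeddings(vec_type='xvector', vec_level='utt', device=torch.device('cpu'))
embeddings.set_vectors(identifiers=['utt-0001', 'utt-0002', 'utt-0003'],
                       vectors=torch.randn(3, 512),             # one row per utterance
                       speakers=['spk-a', 'spk-a', 'spk-b'],
                       genders=['f', 'f', 'm'])
vec = embeddings.get_embedding_for_identifier('utt-0002')       # row lookup via identifiers2idx
spk_level = embeddings.convert_to_spk_level()                   # mean vector per speaker
assert len(spk_level) == 2                                      # one averaged embedding per speaker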
+ create_clean_dir(out_dir) + + save_kaldi_format(self.identifiers2idx, out_dir / f'id2idx') + + idx2spk = dict(zip(list(range(len(self))), self.original_speakers)) + save_kaldi_format(idx2spk, out_dir / f'idx2spk') + + spk2gender = self.get_spk2gender() + save_kaldi_format(spk2gender, out_dir / 'spk2gender') + + torch.save(self.vectors, out_dir / f'speaker_vectors.pt') + + def get_embedding_for_identifier(self, identifier): + idx = self.identifiers2idx[identifier] + return self.vectors[int(idx)] + + def get_speaker(self, identifier): + idx = self.identifiers2idx[identifier] + return self.original_speakers[int(idx)] + + def get_utt_list(self): + return [identifier for identifier, idx in sorted(self.identifiers2idx.items(), key=lambda x: x[1])] + + def get_spk2gender(self): + return {speaker: gender for speaker, gender in zip(self.original_speakers, self.genders)} + + def convert_to_spk_level(self, method='average'): + assert self.vec_level == 'utt', \ + 'Speaker embeddings must be on utterance level to be able to convert them to speaker level!' + + if method == 'average': + spk2idx = defaultdict(list) + for i, speaker in enumerate(self.original_speakers): + spk2idx[speaker].append(i) + + spk_level_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='spk', device=self.device) + spk_vectors, speakers, genders = [], [], [] + if not isinstance(self.vectors, torch.Tensor): + self.vectors = torch.tensor(self.vectors) + for speaker, idx_list in spk2idx.items(): + spk_vectors.append(torch.mean(self.vectors[idx_list], dim=0)) + speakers.append(speaker) + genders.append(self.genders[idx_list[0]]) + spk_level_embeddings.set_vectors(identifiers=speakers, vectors=torch.stack(spk_vectors, dim=0), + speakers=speakers, genders=genders) + + return spk_level_embeddings + else: + return self diff --git a/anonymization/modules/speaker_embeddings/speaker_extraction.py b/anonymization/modules/speaker_embeddings/speaker_extraction.py new file mode 100644 index 0000000..9fcbf87 --- /dev/null +++ b/anonymization/modules/speaker_embeddings/speaker_extraction.py @@ -0,0 +1,139 @@ +from tqdm import tqdm +from pathlib import Path +import torch +import torchaudio +from tqdm.contrib.concurrent import process_map +import time +from torch.multiprocessing import set_start_method +from itertools import repeat +import numpy as np + +from .extraction.embedding_methods import SpeechBrainVectors, StyleEmbeddings +from .extraction.ims_speaker_extraction_methods import normalize_wave +from .speaker_embeddings import SpeakerEmbeddings +from utils import read_kaldi_format + +set_start_method('spawn', force=True) + + +class SpeakerExtraction: + + def __init__(self, devices: list, settings: dict, results_dir: Path = None, model_dir: Path = None, + save_intermediate=True, force_compute=False): + self.devices = devices + self.n_processes = len(self.devices) + self.save_intermediate = save_intermediate + self.force_compute = force_compute if force_compute else settings.get('force_compute_extraction', False) + + self.vec_type = settings['vec_type'] + self.vec_level = settings['vec_level'] + + if results_dir: + self.results_dir = results_dir + elif 'extraction_results_path' in settings: + self.results_dir = settings['extraction_results_path'] + elif 'results_dir' in settings: + self.results_dir = settings['results_dir'] + else: + if self.save_intermediate: + raise ValueError('Results dir must be specified in parameters or settings!') + + self.model_hparams = { + 'vec_type': self.vec_type, + 'model_path': 
settings.get('vec_model_path') or model_dir + } + + if self.n_processes > 1: + self.extractors = None + else: + self.extractors = create_extractors(hparams=self.model_hparams, device=self.devices[0]) + + def extract_speakers(self, dataset_path, dataset_name=None, vec_level=None): + dataset_name = dataset_name if dataset_name is not None else dataset_path.name + dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') + utt2spk = read_kaldi_format(dataset_path / 'utt2spk') + wav_scp = read_kaldi_format(dataset_path / 'wav.scp') + spk2gender = read_kaldi_format(dataset_path / 'spk2gender') + vec_level = vec_level if vec_level is not None else self.vec_level + + speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, vec_level='utt', device=self.devices[0]) + + if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute: + print('No speaker extraction necessary; load existing embeddings instead...') + speaker_embeddings.load_vectors(dataset_results_dir) + else: + print(f'Extract embeddings of {len(wav_scp)} utterances') + speaker_embeddings.new = True + + if self.n_processes > 1: + sleeps = [10 * i for i in range(self.n_processes)] + indices = np.array_split(np.arange(len(wav_scp)), self.n_processes) + wav_scp_items = list(wav_scp.items()) + wav_scp_list = [dict([wav_scp_items[ind] for ind in chunk]) for chunk in indices] + # multiprocessing + job_params = zip(wav_scp_list, repeat(self.extractors), sleeps, self.devices, + repeat(self.model_hparams), list(range(self.n_processes))) + returns = process_map(extraction_job, job_params, max_workers=self.n_processes) + vectors = torch.concat([x[0].to(self.devices[0]) for x in returns], dim=0) + utts = [x[1] for x in returns] + utts = list(np.concatenate(utts)) + else: + vectors, utts = extraction_job([wav_scp, self.extractors, 0, self.devices[0], self.model_hparams, 0]) + vectors = torch.stack(vectors, dim=0) + + speakers = [utt2spk[utt] for utt in utts] + genders = [spk2gender[speaker] for speaker in speakers] + + speaker_embeddings.set_vectors(vectors=vectors, identifiers=utts, speakers=speakers, genders=genders) + + if vec_level == 'spk': + speaker_embeddings = speaker_embeddings.convert_to_spk_level() + if self.save_intermediate: + speaker_embeddings.save_vectors(dataset_results_dir) + + return speaker_embeddings + + +def create_extractors(hparams, device): + extractors = [] + for single_vec_type in hparams['vec_type'].split('+'): + if single_vec_type in {'xvector', 'ecapa'}: + extractors.append(SpeechBrainVectors(vec_type=single_vec_type, model_path=Path(hparams['model_path']), + device=device)) + elif single_vec_type == 'style-embed': + extractors.append(StyleEmbeddings(model_path=Path(hparams['model_path']), device=device)) + else: + raise ValueError(f'Invalid vector type {single_vec_type}!') + + return extractors + + +def extraction_job(data): + wav_scp, speaker_extractors, sleep, device, model_hparams, job_id = data + time.sleep(sleep) + + if speaker_extractors is None: + speaker_extractors = create_extractors(hparams=model_hparams, device=device) + + vectors = [] + utts = [] + for utt, wav_path in tqdm(wav_scp.items(), desc=f'Job {job_id}', leave=True): + if isinstance(wav_path, list): + wav_path = wav_path[1] + signal, fs = torchaudio.load(wav_path) + norm_wave = normalize_wave(signal, fs, device=device) + + try: + spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors] + except RuntimeError: + print(f'Runtime error: {utt}, 
{signal.shape}, {norm_wave.shape}') + continue + + if len(spk_embs) == 1: + vector = spk_embs[0] + else: + vector = torch.cat(spk_embs, dim=0) + vectors.append(vector) + utts.append(utt) + + return vectors, utts diff --git a/anonymization/modules/text/__init__.py b/anonymization/modules/text/__init__.py new file mode 100644 index 0000000..5a94596 --- /dev/null +++ b/anonymization/modules/text/__init__.py @@ -0,0 +1 @@ +from .speech_recognition import SpeechRecognition \ No newline at end of file diff --git a/anonymization/modules/text/recognition/__init__.py b/anonymization/modules/text/recognition/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anonymization/modules/text/recognition/ims_asr.py b/anonymization/modules/text/recognition/ims_asr.py new file mode 100644 index 0000000..44c8b31 --- /dev/null +++ b/anonymization/modules/text/recognition/ims_asr.py @@ -0,0 +1,59 @@ +import torch +torch.set_num_threads(1) +from espnet2.bin.asr_inference import Speech2Text +import soundfile +import resampy +from espnet_model_zoo.downloader import ModelDownloader, str_to_hash + +from utils.data_io import parse_yaml + + +class ImsASR: + + def __init__(self, model_path, device, ctc_weight=0.2, utt_start_token='', utt_end_token='', **kwargs): + self.device = device + self.model_path = model_path + self.ctc_weight = ctc_weight + self.utt_start_token = utt_start_token + self.utt_end_token = utt_end_token + + + # It is not sufficient to simply unzip the model.zip folder because this would not set up the environment + # correctly. Instead, we have to call the ModelDownloader routine at least once before we can use the model. + # However, we do not want to run this every time, so we check first if the unzipped model (stored by hash + # value) already exists + cache_path = model_path.parent + d = ModelDownloader(cachedir=cache_path) + local_url = str(model_path.absolute()) + hash = str_to_hash(local_url) + + if (cache_path / hash).exists(): + yaml = parse_yaml(cache_path / hash / 'meta.yaml') + asr_model_file = str(cache_path / hash / yaml['files']['asr_model_file']) + asr_train_config = str(cache_path / hash / yaml['yaml_files']['asr_train_config']) + else: + model_files = d.download_and_unpack(local_url) + asr_train_config = model_files['asr_train_config'] + asr_model_file = model_files['asr_model_file'] + + self.speech2text = Speech2Text(asr_train_config=asr_train_config, + asr_model_file=asr_model_file, + device=str(self.device), + minlenratio=0.0, + maxlenratio=0.0, + ctc_weight=ctc_weight, + beam_size=15, + batch_size=1, + nbest=1, + quantize_asr_model=False) + + self.output = 'phones' if '-phn' in model_path.name else 'text' + + def recognize_speech_of_audio(self, audio_file): + speech, rate = soundfile.read(audio_file) + speech = torch.tensor(resampy.resample(speech, rate, 16000), device=self.device) + + nbests = self.speech2text(speech) + text, *_ = nbests[0] + text = self.utt_start_token + text + self.utt_end_token + return text diff --git a/anonymization/modules/text/speech_recognition.py b/anonymization/modules/text/speech_recognition.py new file mode 100644 index 0000000..bf4dbf6 --- /dev/null +++ b/anonymization/modules/text/speech_recognition.py @@ -0,0 +1,149 @@ +from tqdm import tqdm +from tqdm.contrib.concurrent import process_map +import time +from torch.multiprocessing import set_start_method +from itertools import repeat +import numpy as np +from pathlib import Path + +from .text import Text +from .recognition.ims_asr import ImsASR +from utils import read_kaldi_format 
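# SpeakerExtraction above and SpeechRecognition below share the same dispatch pattern: the
# utterance list is split into one chunk per device and the chunks are processed in parallel
# with tqdm's process_map. Condensed sketch of that chunking; the utterance ids, device names
# and the stand-in job function are hypothetical, only numpy and tqdm are assumed.
import numpy as np
from tqdm.contrib.concurrent import process_map

def _demo_job(params):
    chunk, device = params
    return [(utt, device) for utt in chunk]         # placeholder for per-device model inference

if __name__ == '__main__':
    utterances = [f'utt-{i:04d}' for i in range(10)]
    devices = ['cuda:0', 'cuda:1']                  # one worker process per device
    index_chunks = np.array_split(np.arange(len(utterances)), len(devices))
    chunks = [[utterances[i] for i in chunk] for chunk in index_chunks]
    results = process_map(_demo_job, list(zip(chunks, devices)), max_workers=len(devices))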
+ +set_start_method('spawn', force=True) + + +class SpeechRecognition: + + def __init__(self, devices, settings, results_dir=None, save_intermediate=True, force_compute=False): + self.devices = devices + self.save_intermediate = save_intermediate + self.force_compute = force_compute if force_compute else settings.get('force_compute_recognition', False) + self.n_processes = len(self.devices) + + self.model_hparams = settings + + if results_dir: + self.results_dir = results_dir + elif 'results_path' in settings: + self.results_dir = settings['results_path'] + elif 'results_dir' in settings: + self.results_dir = settings['results_dir'] + else: + if self.save_intermediate: + raise ValueError('Results dir must be specified in parameters or settings!') + + self.asr_model = create_model_instance(hparams=self.model_hparams, device=devices[0]) + self.is_phones = (self.asr_model.output == 'phones') + + if self.n_processes > 1: + self.asr_model = None + + def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None): + dataset_name = dataset_name if dataset_name else dataset_path.name + dataset_results_dir = self.results_dir / dataset_name if self.save_intermediate else Path('') + + utt2spk = read_kaldi_format(dataset_path / 'utt2spk') + texts = Text(is_phones=self.is_phones) + + if (dataset_results_dir / 'text').exists() and not self.force_compute: + # if the text created from this ASR model already exists for this dataset and a computation is not + # forced, simply load the text + texts.load_text(in_dir=dataset_results_dir) + + if len(texts) == len(utt2spk): + print('No speech recognition necessary; load existing text instead...') + else: + if len(texts) > 0: + print(f'No speech recognition necessary for {len(texts)} of {len(utt2spk)} utterances') + # otherwise, recognize the speech + dataset_results_dir.mkdir(exist_ok=True, parents=True) + print(f'Recognize speech of {len(utt2spk)} utterances...') + wav_scp = read_kaldi_format(dataset_path / 'wav.scp') + + utterances = [] + for utt, spk in utt2spk.items(): + if utt in texts.utterances: + continue + if utterance_list and utt not in utterance_list: + continue + if utt in wav_scp: + utterances.append((utt, spk, wav_scp[utt])) + + save_intermediate = self.save_intermediate and not utterance_list + start = time.time() + + if self.n_processes == 1: + new_texts = [recognition_job([utterances, self.asr_model, + dataset_results_dir, 0, self.devices[0], self.model_hparams, None, + save_intermediate])] + else: + sleeps = [10 * i for i in range(self.n_processes)] + indices = np.array_split(np.arange(len(utterances)), self.n_processes) + utterance_jobs = [[utterances[ind] for ind in chunk] for chunk in indices] + # multiprocessing + job_params = zip(utterance_jobs, repeat(self.asr_model), repeat(dataset_results_dir), sleeps, + self.devices, repeat(self.model_hparams), list(range(self.n_processes)), + repeat(save_intermediate)) + new_texts = process_map(recognition_job, job_params, max_workers=self.n_processes) + + end = time.time() + total_time = round(end - start, 2) + print(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / ' + f'{round(total_time / 60 / 60, 2)} hours)') + texts = self._combine_texts(main_text_instance=texts, additional_text_instances=new_texts) + + if save_intermediate: + texts.save_text(out_dir=dataset_results_dir) + self._remove_temp_files(out_dir=dataset_results_dir) + + return texts + + def _combine_texts(self, main_text_instance, additional_text_instances): + for 
add_text_instance in additional_text_instances: + main_text_instance.add_instances(sentences=add_text_instance.sentences, + utterances=add_text_instance.utterances, + speakers=add_text_instance.speakers) + + return main_text_instance + + def _remove_temp_files(self, out_dir): + temp_text_files = [filename for filename in out_dir.glob('text*') if filename.name != 'text'] + temp_utt2spk_files = [filename for filename in out_dir.glob('utt2spk*') if filename.name != 'utt2spk'] + + for file in temp_text_files + temp_utt2spk_files: + file.unlink() + + +def create_model_instance(hparams, device): + recognizer = hparams.get('recognizer') + if recognizer == 'ims': + return ImsASR(**hparams, device=device) + else: + raise ValueError(f'Invalid recognizer option: {recognizer}') + + +def recognition_job(data): + utterances, asr_model, out_dir, sleep, device, model_hparams, job_id, save_intermediate = data + time.sleep(sleep) + + add_suffix = f'_{job_id}' if job_id is not None else None + job_id = job_id or 0 + + if asr_model is None: + asr_model = create_model_instance(hparams=model_hparams, device=device) + + texts = Text(is_phones=(asr_model.output == 'phones')) + i = 0 + for utt, spk, wav_path in tqdm(utterances, desc=f'Job {job_id}', leave=True): + sentence = asr_model.recognize_speech_of_audio(audio_file=wav_path) + texts.add_instance(sentence=sentence, utterance=utt, speaker=spk) + + i += 1 + if i % 100 == 0 and save_intermediate: + texts.save_text(out_dir=out_dir, add_suffix=add_suffix) + + if save_intermediate: + texts.save_text(out_dir=out_dir, add_suffix=add_suffix) + + return texts diff --git a/anonymization/modules/text/text.py b/anonymization/modules/text/text.py new file mode 100644 index 0000000..d693822 --- /dev/null +++ b/anonymization/modules/text/text.py @@ -0,0 +1,122 @@ +from pathlib import Path +import numpy as np + +from utils import read_kaldi_format, save_kaldi_format + + +class Text: + + def __init__(self, is_phones=False): + self.sentences = [] + self.utterances = [] + self.speakers = [] + self.utt2idx = {} + + self.new = True + self.is_phones = is_phones + + def __len__(self): + return len(self.utterances) + + def __iter__(self): + for i in range(len(self)): + yield self.sentences[i], self.utterances[i], self.speakers[i] + + def __getitem__(self, utt): + return self.get_instance(utt)[0] + + def add_instance(self, sentence, utterance, speaker): + self.utt2idx[utterance] = len(self) + self.sentences.append(sentence) + self.utterances.append(utterance) + self.speakers.append(speaker) + + def add_instances(self, sentences, utterances, speakers): + if len(set(utterances) & set(self.utterances)) > 0: + remove_indices = [i for i, utt in enumerate(utterances) if utt in self.utterances] + for idx in reversed(remove_indices): + del sentences[idx] + del utterances[idx] + del speakers[idx] + + self.utt2idx.update({utt: len(self) + i for i, utt in enumerate(utterances)}) + self.sentences.extend(sentences) + self.utterances.extend(utterances) + self.speakers.extend(speakers) + + def get_instance(self, utterance): + idx = self.utt2idx[utterance] + return self.sentences[idx], self.speakers[idx] + + def get_iterators(self, n): + # divides the stored data into n packages and returns a list of iterators over each package + # like __iter__, but with several iterators + + def _get_instance_by_indices(indices): + for i in indices: + yield self.sentences[i], self.utterances[i], self.speakers[i] + + iterator_length = len(self) // n + it_slices = [[iterator_length * i, iterator_length * (i + 1)] if 
i < (n - 1) + else [iterator_length * i, len(self)] for i in range(n)] + iterators = [_get_instance_by_indices(range(*it_slice)) for it_slice in it_slices] + return iterators + + def update_instance(self, utterance, sentence): + idx = self.utt2idx[utterance] + self.sentences[idx] = sentence + + def remove_instances(self, utterance_list): + remove_indices = [self.utt2idx[utt] for utt in utterance_list if utt in self.utt2idx] + remove_indices = sorted(remove_indices, reverse=True) + for idx in remove_indices: + del self.sentences[idx] + del self.utterances[idx] + del self.speakers[idx] + self.utt2idx = {utt: i for i, utt in enumerate(self.utterances)} + + def get_instances_of_speaker(self, speaker): + indices = [i for (i, spk) in enumerate(self.speakers) if spk == speaker] + sentences = [self.sentences[i] for i in indices] + utterances = [self.utterances[i] for i in indices] + return sentences, utterances + + def shuffle(self): + shuffled_sentences = [] + shuffled_utterances = [] + shuffled_speakers = [] + for i in np.random.permutation(len(self)): + shuffled_sentences.append(self.sentences[i]) + shuffled_utterances.append(self.utterances[i]) + shuffled_speakers.append(self.speakers[i]) + self.sentences = shuffled_sentences + self.utterances = shuffled_utterances + self.speakers = shuffled_speakers + self.utt2idx = {utt: i for i, utt in enumerate(self.utterances)} + + def save_text(self, out_dir: Path, add_suffix=None): + out_dir.mkdir(exist_ok=True, parents=True) + add_suffix = add_suffix if add_suffix is not None else "" + save_kaldi_format(data=list(zip(self.utterances, self.sentences)), + filename=out_dir / f'text{add_suffix}') + save_kaldi_format(data=list(zip(self.utterances, self.speakers)), + filename=out_dir / f'utt2spk{add_suffix}') + + def load_text(self, in_dir, add_suffix=None): + self.new = False + add_suffix = add_suffix if add_suffix is not None else "" + utt_1, sentences = read_kaldi_format(filename=in_dir / f'text{add_suffix}', return_as_dict=False, + values_as_string=True) + utt_2, speakers = read_kaldi_format(filename=in_dir / f'utt2spk{add_suffix}', return_as_dict=False) + + if utt_1 == utt_2: + self.utterances = utt_1 + self.sentences = sentences + self.speakers = speakers + elif sorted(utt_1) == sorted(utt_2): + self.utterances, self.sentences = zip(*sorted(zip(utt_1, sentences), key=lambda x: x[0])) + _, self.speakers = zip(*sorted(zip(utt_2, speakers), key=lambda x: x[0])) + else: + raise ValueError(f'{in_dir / f"text{add_suffix}"} and {in_dir / f"utt2spk{add_suffix}"} have mismatching ' + f'utterance keys; sentences cannot be loaded!') + self.utt2idx = {utt: i for i, utt in enumerate(self.utterances)} diff --git a/anonymization/modules/tts/IMSToucan/.gitignore b/anonymization/modules/tts/IMSToucan/.gitignore new file mode 100644 index 0000000..1b75bcf --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/.gitignore @@ -0,0 +1,20 @@ +.idea +*.pyc +*.png +*.pdf +tensorboard_logs +Corpora +Models +*_graph +*.out +*.wav +audios/notes.txt +audios/ +*playground* +apex/ +pretrained_models/ +*.json +.tmp/ +.vscode/ +split/ +singing/ \ No newline at end of file diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py new file mode 100644 index 0000000..8ecd359 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/AnonFastSpeech2.py @@ -0,0 +1,223 @@ +import itertools +import os + +import librosa.display as lbd +import matplotlib.pyplot as plt 
+import noisereduce +import sounddevice +import soundfile +import torch + +from ..InferenceInterfaces.InferenceArchitectures.InferenceFastSpeech2 import FastSpeech2 +from ..InferenceInterfaces.InferenceArchitectures.InferenceHiFiGAN import HiFiGANGenerator +from ..Preprocessing.AudioPreprocessor import AudioPreprocessor +from ..Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend +from ..Preprocessing.TextFrontend import get_language_id +from ..TrainingInterfaces.Spectrogram_to_Embedding.StyleEmbedding import StyleEmbedding + + +class AnonFastSpeech2(torch.nn.Module): + + def __init__(self, path_to_hifigan_model, path_to_fastspeech_model, path_to_embed_model, device="cpu", language="en", noise_reduce=False): + super().__init__() + self.device = device + self.audio_preprocessor = AudioPreprocessor(input_sr=16000, output_sr=16000, cut_silence=True, device=self.device) + self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True) + checkpoint = torch.load(path_to_fastspeech_model, map_location='cpu') + try: + self.use_lang_id = False + self.phone2mel = FastSpeech2(weights=checkpoint["model"]).to(torch.device(device)) + except RuntimeError: + print("Loading a multilingual model, which is strange for this purpose. Please double check that the correct model is being loaded.") + self.use_lang_id = True + self.phone2mel = FastSpeech2(weights=checkpoint["model"], lang_embs=1000).to(torch.device(device)) + self.mel2wav = HiFiGANGenerator(path_to_weights=path_to_hifigan_model).to(torch.device(device)) + self.style_embedding_function = StyleEmbedding() + check_dict = torch.load(path_to_embed_model, map_location="cpu") + self.style_embedding_function.load_state_dict(check_dict["style_emb_func"]) + self.style_embedding_function.to(self.device) + self.default_utterance_embedding = checkpoint["default_emb"].to(self.device) + self.phone2mel.eval() + self.mel2wav.eval() + if self.use_lang_id: + self.lang_id = get_language_id(language) + else: + self.lang_id = None + self.to(torch.device(device)) + self.noise_reduce = noise_reduce + if self.noise_reduce: + self.prototypical_noise = None + self.update_noise_profile() + + def set_utterance_embedding(self, path_to_reference_audio="", embedding=None): + if embedding is not None: + self.default_utterance_embedding = embedding.squeeze().to(self.device) + return + assert os.path.exists(path_to_reference_audio) + wave, sr = soundfile.read(path_to_reference_audio) + if sr != self.audio_preprocessor.sr: + self.audio_preprocessor = AudioPreprocessor(input_sr=sr, output_sr=16000, cut_silence=True, device=self.device) + spec = self.audio_preprocessor.audio_to_mel_spec_tensor(wave).transpose(0, 1) + spec_len = torch.LongTensor([len(spec)]) + self.default_utterance_embedding = self.style_embedding_function(spec.unsqueeze(0).to(self.device), + spec_len.unsqueeze(0).to(self.device)).squeeze() + + def set_language(self, lang_id): + """ + The id parameter actually refers to the shorthand. 
This has become ambiguous with the introduction of the actual language IDs + """ + self.set_phonemizer_language(lang_id=lang_id) + self.set_accent_language(lang_id=lang_id) + + def set_phonemizer_language(self, lang_id): + self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True) + + def set_accent_language(self, lang_id): + if self.use_lang_id: + self.lang_id = get_language_id(lang_id).to(self.device) + else: + self.lang_id = None + + def forward(self, + text, + view=False, + duration_scaling_factor=1.0, + pitch_variance_scale=1.0, + energy_variance_scale=1.0, + durations=None, + pitch=None, + energy=None, + text_is_phonemes=False): + """ + duration_scaling_factor: reasonable values are 0.5 < scale < 1.5. + 1.0 means no scaling happens, higher values increase durations for the whole + utterance, lower values decrease durations for the whole utterance. + pitch_variance_scale: reasonable values are 0.0 < scale < 2.0. + 1.0 means no scaling happens, higher values increase variance of the pitch curve, + lower values decrease variance of the pitch curve. + energy_variance_scale: reasonable values are 0.0 < scale < 2.0. + 1.0 means no scaling happens, higher values increase variance of the energy curve, + lower values decrease variance of the energy curve. + """ + with torch.inference_mode(): + phones = self.text2phone.string_to_tensor(text, input_phonemes=text_is_phonemes).to(torch.device(self.device)) + mel, durations, pitch, energy = self.phone2mel(phones, + return_duration_pitch_energy=True, + utterance_embedding=self.default_utterance_embedding, + durations=durations, + pitch=pitch, + energy=energy, + lang_id=self.lang_id) + mel = mel.transpose(0, 1) + wave = self.mel2wav(mel) + if view: + from ..Utility.utils import cumsum_durations + fig, ax = plt.subplots(nrows=2, ncols=1) + ax[0].plot(wave.cpu().numpy()) + lbd.specshow(mel.cpu().numpy(), + ax=ax[1], + sr=16000, + cmap='GnBu', + y_axis='mel', + x_axis=None, + hop_length=256) + ax[0].yaxis.set_visible(False) + ax[1].yaxis.set_visible(False) + duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) + ax[1].set_xticks(duration_splits, minor=True) + ax[1].xaxis.grid(True, which='minor') + ax[1].set_xticks(label_positions, minor=False) + ax[1].set_xticklabels(self.text2phone.get_phone_string(text, for_plot_labels=True)) + ax[0].set_title(text) + plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0) + plt.show() + if self.noise_reduce: + wave = torch.tensor(noisereduce.reduce_noise(y=wave.cpu().numpy(), y_noise=self.prototypical_noise, sr=48000, stationary=True), device=self.device) + return wave + + def read_to_file(self, + text_list, + file_location, + duration_scaling_factor=1.0, + pitch_variance_scale=1.0, + energy_variance_scale=1.0, + silent=False, + dur_list=None, + pitch_list=None, + energy_list=None): + """ + Args: + silent: Whether to suppress the per-sentence progress prints + text_list: A list of strings to be read + file_location: The path and name of the file it should be saved to + energy_list: list of energy tensors to be used for the texts + pitch_list: list of pitch tensors to be used for the texts + dur_list: list of duration tensors to be used for the texts + duration_scaling_factor: reasonable values are 0.5 < scale < 1.5. + 1.0 means no scaling happens, higher values increase durations for the whole + utterance, lower values decrease durations for the whole utterance. + pitch_variance_scale: reasonable values are 0.0 < scale < 2.0.
+ 1.0 means no scaling happens, higher values increase variance of the pitch curve, + lower values decrease variance of the pitch curve. + energy_variance_scale: reasonable values are 0.0 < scale < 2.0. + 1.0 means no scaling happens, higher values increase variance of the energy curve, + lower values decrease variance of the energy curve. + """ + if not dur_list: + dur_list = [] + if not pitch_list: + pitch_list = [] + if not energy_list: + energy_list = [] + wav = None + silence = torch.zeros([24000]) + for (text, durations, pitch, energy) in itertools.zip_longest(text_list, dur_list, pitch_list, energy_list): + if text.strip() != "": + if not silent: + print("Now synthesizing: {}".format(text)) + if wav is None: + if durations is not None: + durations = durations.to(self.device) + if pitch is not None: + pitch = pitch.to(self.device) + if energy is not None: + energy = energy.to(self.device) + wav = self(text, + durations=durations, + pitch=pitch, + energy=energy, + duration_scaling_factor=duration_scaling_factor, + pitch_variance_scale=pitch_variance_scale, + energy_variance_scale=energy_variance_scale).cpu() + wav = torch.cat((wav, silence), 0) + else: + wav = torch.cat((wav, self(text, + durations=durations.to(self.device), + pitch=pitch.to(self.device), + energy=energy.to(self.device), + duration_scaling_factor=duration_scaling_factor, + pitch_variance_scale=pitch_variance_scale, + energy_variance_scale=energy_variance_scale).cpu()), 0) + wav = torch.cat((wav, silence), 0) + soundfile.write(file=file_location, data=wav.cpu().numpy(), samplerate=48000) + + def read_aloud(self, + text, + view=False, + duration_scaling_factor=1.0, + pitch_variance_scale=1.0, + energy_variance_scale=1.0, + blocking=False): + if text.strip() == "": + return + wav = self(text, + view, + duration_scaling_factor=duration_scaling_factor, + pitch_variance_scale=pitch_variance_scale, + energy_variance_scale=energy_variance_scale).cpu() + wav = torch.cat((wav, torch.zeros([24000])), 0) + if not blocking: + sounddevice.play(wav.numpy(), samplerate=48000) + else: + sounddevice.play(torch.cat((wav, torch.zeros([12000])), 0).numpy(), samplerate=48000) + sounddevice.wait() diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py new file mode 100644 index 0000000..6b70f3e --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py @@ -0,0 +1,310 @@ +from abc import ABC + +import torch + +from ...Layers.Conformer import Conformer +from ...Layers.DurationPredictor import DurationPredictor +from ...Layers.LengthRegulator import LengthRegulator +from ...Layers.PostNet import PostNet +from ...Layers.VariancePredictor import VariancePredictor +from ...Utility.utils import make_non_pad_mask +from ...Utility.utils import make_pad_mask + + +class FastSpeech2(torch.nn.Module, ABC): + + def __init__(self, # network structure related + weights, + idim=62, + odim=80, + adim=384, + aheads=4, + elayers=6, + eunits=1536, + dlayers=6, + dunits=1536, + postnet_layers=5, + postnet_chans=256, + postnet_filts=5, + positionwise_conv_kernel_size=1, + use_scaled_pos_enc=True, + use_batch_norm=True, + encoder_normalize_before=True, + decoder_normalize_before=True, + encoder_concat_after=False, + decoder_concat_after=False, + reduction_factor=1, + # encoder / decoder + use_macaron_style_in_conformer=True, + 
use_cnn_in_conformer=True, + conformer_enc_kernel_size=7, + conformer_dec_kernel_size=31, + # duration predictor + duration_predictor_layers=2, + duration_predictor_chans=256, + duration_predictor_kernel_size=3, + # energy predictor + energy_predictor_layers=2, + energy_predictor_chans=256, + energy_predictor_kernel_size=3, + energy_predictor_dropout=0.5, + energy_embed_kernel_size=1, + energy_embed_dropout=0.0, + stop_gradient_from_energy_predictor=True, + # pitch predictor + pitch_predictor_layers=5, + pitch_predictor_chans=256, + pitch_predictor_kernel_size=5, + pitch_predictor_dropout=0.5, + pitch_embed_kernel_size=1, + pitch_embed_dropout=0.0, + stop_gradient_from_pitch_predictor=True, + # training related + transformer_enc_dropout_rate=0.2, + transformer_enc_positional_dropout_rate=0.2, + transformer_enc_attn_dropout_rate=0.2, + transformer_dec_dropout_rate=0.2, + transformer_dec_positional_dropout_rate=0.2, + transformer_dec_attn_dropout_rate=0.2, + duration_predictor_dropout_rate=0.2, + postnet_dropout_rate=0.5, + # additional features + utt_embed_dim=128, + lang_embs=None): + super().__init__() + self.idim = idim + self.odim = odim + self.reduction_factor = reduction_factor + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.use_scaled_pos_enc = use_scaled_pos_enc + self.multilingual_model = lang_embs is not None + self.multispeaker_model = utt_embed_dim is not None + + embed = torch.nn.Sequential(torch.nn.Linear(idim, 100), + torch.nn.Tanh(), + torch.nn.Linear(100, adim)) + self.encoder = Conformer(idim=idim, attention_dim=adim, attention_heads=aheads, linear_units=eunits, num_blocks=elayers, + input_layer=embed, dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, + normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, macaron_style=use_macaron_style_in_conformer, + use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_enc_kernel_size, zero_triu=False, + utt_embed=utt_embed_dim, lang_embs=lang_embs) + self.duration_predictor = DurationPredictor(idim=adim, n_layers=duration_predictor_layers, + n_chans=duration_predictor_chans, + kernel_size=duration_predictor_kernel_size, + dropout_rate=duration_predictor_dropout_rate, ) + self.pitch_predictor = VariancePredictor(idim=adim, n_layers=pitch_predictor_layers, + n_chans=pitch_predictor_chans, + kernel_size=pitch_predictor_kernel_size, + dropout_rate=pitch_predictor_dropout) + self.pitch_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim, + kernel_size=pitch_embed_kernel_size, + padding=(pitch_embed_kernel_size - 1) // 2), + torch.nn.Dropout(pitch_embed_dropout)) + self.energy_predictor = VariancePredictor(idim=adim, n_layers=energy_predictor_layers, + n_chans=energy_predictor_chans, + kernel_size=energy_predictor_kernel_size, + dropout_rate=energy_predictor_dropout) + self.energy_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim, + kernel_size=energy_embed_kernel_size, + padding=(energy_embed_kernel_size - 1) // 2), + torch.nn.Dropout(energy_embed_dropout)) + self.length_regulator = LengthRegulator() + self.decoder = Conformer(idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + 
dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, + utt_embed=None) + self.feat_out = torch.nn.Linear(adim, odim * reduction_factor) + self.postnet = PostNet(idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=postnet_dropout_rate) + self.load_state_dict(weights) + + def _forward(self, text_tensors, text_lens, gold_speech=None, speech_lens=None, + gold_durations=None, gold_pitch=None, gold_energy=None, + is_inference=False, duration_scaling_factor=1.0, utterance_embedding=None, lang_ids=None, + pitch_variance_scale=1.0, energy_variance_scale=1.0): + + if not self.multilingual_model: + lang_ids = None + + if not self.multispeaker_model: + utterance_embedding = None + + # forward encoder + text_masks = self._source_mask(text_lens) + + encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids) # (B, Tmax, adim) + + # forward duration predictor and variance predictors + duration_masks = make_pad_mask(text_lens, device=text_lens.device) + + if self.stop_gradient_from_pitch_predictor: + pitch_predictions = self.pitch_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1)) + else: + pitch_predictions = self.pitch_predictor(encoded_texts, duration_masks.unsqueeze(-1)) + + if self.stop_gradient_from_energy_predictor: + energy_predictions = self.energy_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1)) + else: + energy_predictions = self.energy_predictor(encoded_texts, duration_masks.unsqueeze(-1)) + + if is_inference: + if gold_durations is not None: + duration_predictions = gold_durations + else: + duration_predictions = self.duration_predictor.inference(encoded_texts, duration_masks) + if gold_pitch is not None: + pitch_predictions = gold_pitch + if gold_energy is not None: + energy_predictions = gold_energy + + for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)): + if phoneme_vector[61] == 0: + pitch_predictions[0][phoneme_index] = 0.0 + pitch_predictions = _scale_variance(pitch_predictions, pitch_variance_scale) + energy_predictions = _scale_variance(energy_predictions, energy_variance_scale) + + pitch_embeddings = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2) + energy_embeddings = self.energy_embed(energy_predictions.transpose(1, 2)).transpose(1, 2) + encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings + encoded_texts = self.length_regulator(encoded_texts, duration_predictions, duration_scaling_factor) + else: + duration_predictions = self.duration_predictor(encoded_texts, duration_masks) + + for phoneme_index, phoneme_vector in enumerate(text_tensors): + if phoneme_vector[61] == 0: + pitch_predictions[phoneme_index] = 0.0 + energy_predictions[phoneme_index] = 0.0 + pitch_predictions = _scale_variance(pitch_predictions, pitch_variance_scale) + energy_predictions = _scale_variance(energy_predictions, energy_variance_scale) + + # use groundtruth to clone + pitch_embeddings = self.pitch_embed(gold_pitch.transpose(1, 2)).transpose(1, 2) + energy_embeddings = 
self.energy_embed(gold_energy.transpose(1, 2)).transpose(1, 2) + encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings + encoded_texts = self.length_regulator(encoded_texts, gold_durations) # (B, Lmax, adim) + + # forward decoder + if speech_lens is not None and not is_inference: + if self.reduction_factor > 1: + olens_in = speech_lens.new([olen // self.reduction_factor for olen in speech_lens]) + else: + olens_in = speech_lens + h_masks = self._source_mask(olens_in) + else: + h_masks = None + zs, _ = self.decoder(encoded_texts, h_masks) # (B, Lmax, adim) + before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim) # (B, Lmax, odim) + + # postnet -> (B, Lmax//r * r, odim) + after_outs = before_outs + self.postnet(before_outs.transpose(1, 2)).transpose(1, 2) + + return before_outs, after_outs, duration_predictions, pitch_predictions, energy_predictions + + @torch.no_grad() + def forward(self, + text, + speech=None, + durations=None, + pitch=None, + energy=None, + utterance_embedding=None, + return_duration_pitch_energy=False, + lang_id=None, + duration_scaling_factor=1.0, + pitch_variance_scale=1.0, + energy_variance_scale=1.0): + """ + Generate the sequence of spectrogram frames given the sequence of vectorized phonemes. + + Args: + text: input sequence of vectorized phonemes + speech: feature sequence to extract style from (not used for now, placeholder for future plans) + durations: durations to be used (optional, if not provided, they will be predicted) + pitch: token-averaged pitch curve to be used (optional, if not provided, it will be predicted) + energy: token-averaged energy curve to be used (optional, if not provided, it will be predicted) + return_duration_pitch_energy: whether to return the list of predicted durations for nicer plotting + utterance_embedding: embedding of speaker information + lang_id: id to be fed into the embedding layer that contains language information + duration_scaling_factor: reasonable values are 0.8 < scale < 1.2. + 1.0 means no scaling happens, higher values increase durations for the whole + utterance, lower values decrease durations for the whole utterance. + pitch_variance_scale: reasonable values are 0.6 < scale < 1.4. + 1.0 means no scaling happens, higher values increase variance of the pitch curve, + lower values decrease variance of the pitch curve. + energy_variance_scale: reasonable values are 0.6 < scale < 1.4. + 1.0 means no scaling happens, higher values increase variance of the energy curve, + lower values decrease variance of the energy curve. 
+ + Returns: + mel spectrogram + + """ + self.eval() + # setup batch axis + ilens = torch.tensor([text.shape[0]], dtype=torch.long, device=text.device) + if speech is not None: + gold_speech = speech.unsqueeze(0).to(text.device) + else: + gold_speech = None + if durations is not None: + durations = durations.unsqueeze(0).to(text.device) + if pitch is not None: + pitch = pitch.unsqueeze(0).to(text.device) + if energy is not None: + energy = energy.unsqueeze(0).to(text.device) + if lang_id is not None: + lang_id = lang_id.unsqueeze(0).to(text.device) + + before_outs, after_outs, d_outs, pitch_predictions, energy_predictions = self._forward(text.unsqueeze(0), + ilens, + gold_speech=gold_speech, + gold_durations=durations, + is_inference=True, + gold_pitch=pitch, + gold_energy=energy, + utterance_embedding=utterance_embedding.unsqueeze(0), + lang_ids=lang_id, + duration_scaling_factor=duration_scaling_factor, + pitch_variance_scale=pitch_variance_scale, + energy_variance_scale=energy_variance_scale) + self.train() + if return_duration_pitch_energy: + return after_outs[0], d_outs[0], pitch_predictions[0], energy_predictions[0] + return after_outs[0] + + def _source_mask(self, ilens): + x_masks = make_non_pad_mask(ilens).to(next(self.parameters()).device) + return x_masks.unsqueeze(-2) + + +def _scale_variance(sequence, scale): + if scale == 1.0: + return sequence + average = sequence[0][sequence[0] != 0.0].mean() + sequence = sequence - average # center sequence around 0 + sequence = sequence * scale # scale the variance + sequence = sequence + average # move center back to original with changed variance + for sequence_index in range(len(sequence[0])): + if sequence[0][sequence_index] < 0.0: + sequence[0][sequence_index] = 0.0 + return sequence diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py new file mode 100644 index 0000000..6ada209 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py @@ -0,0 +1,91 @@ +import torch + +from ...Layers.ResidualBlock import HiFiGANResidualBlock as ResidualBlock + + +class HiFiGANGenerator(torch.nn.Module): + + def __init__(self, + path_to_weights, + in_channels=80, + out_channels=1, + channels=512, + kernel_size=7, + upsample_scales=(8, 6, 4, 4), + upsample_kernel_sizes=(16, 12, 8, 8), + resblock_kernel_sizes=(3, 7, 11), + resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)], + use_additional_convs=True, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.1}, + use_weight_norm=True, ): + super().__init__() + assert kernel_size % 2 == 1, "Kernal size must be odd number." 
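# Worked example of the variance scaling implemented by _scale_variance above: the pitch or
# energy curve is centered on the mean of its nonzero values, the deviations are multiplied by
# the scale factor, the mean is added back, and negative values are clipped to zero. Standalone
# re-implementation on an unbatched, hypothetical pitch curve for illustration only.
import torch

def scale_variance_demo(sequence, scale):
    if scale == 1.0:
        return sequence
    average = sequence[sequence != 0.0].mean()
    scaled = (sequence - average) * scale + average   # widen or flatten the curve around its mean
    return torch.clamp(scaled, min=0.0)               # pitch/energy values cannot be negative

pitch = torch.tensor([0.0, 180.0, 220.0, 200.0, 0.0])
print(scale_variance_demo(pitch, 1.4))                # more variation around the nonzero mean of 200
print(scale_variance_demo(pitch, 0.6))                # less variation around the nonzero mean of 200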
+ assert len(upsample_scales) == len(upsample_kernel_sizes) + assert len(resblock_dilations) == len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_kernel_sizes) + self.num_blocks = len(resblock_kernel_sizes) + self.input_conv = torch.nn.Conv1d(in_channels, + channels, + kernel_size, + 1, + padding=(kernel_size - 1) // 2, ) + self.upsamples = torch.nn.ModuleList() + self.blocks = torch.nn.ModuleList() + for i in range(len(upsample_kernel_sizes)): + self.upsamples += [ + torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + torch.nn.ConvTranspose1d(channels // (2 ** i), + channels // (2 ** (i + 1)), + upsample_kernel_sizes[i], + upsample_scales[i], + padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2, ), )] + for j in range(len(resblock_kernel_sizes)): + self.blocks += [ResidualBlock(kernel_size=resblock_kernel_sizes[j], + channels=channels // (2 ** (i + 1)), + dilations=resblock_dilations[j], + bias=bias, + use_additional_convs=use_additional_convs, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, )] + self.output_conv = torch.nn.Sequential( + torch.nn.LeakyReLU(), + torch.nn.Conv1d(channels // (2 ** (i + 1)), + out_channels, + kernel_size, + 1, + padding=(kernel_size - 1) // 2, ), + torch.nn.Tanh(), ) + if use_weight_norm: + self.apply_weight_norm() + self.load_state_dict(torch.load(path_to_weights, map_location='cpu')["generator"]) + + def forward(self, c, normalize_before=False): + if normalize_before: + c = (c - self.mean) / self.scale + c = self.input_conv(c.unsqueeze(0)) + for i in range(self.num_upsamples): + c = self.upsamples[i](c) + cs = 0.0 # initialize + for j in range(self.num_blocks): + cs = cs + self.blocks[i * self.num_blocks + j](c) + c = cs / self.num_blocks + c = self.output_conv(c) + return c.squeeze(0).squeeze(0) + + def remove_weight_norm(self): + def _remove_weight_norm(m): + try: + torch.nn.utils.remove_weight_norm(m) + except ValueError: + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + torch.nn.utils.weight_norm(m) + + self.apply(_apply_weight_norm) diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/__init__.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/InferenceArchitectures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anonymization/modules/tts/IMSToucan/InferenceInterfaces/__init__.py b/anonymization/modules/tts/IMSToucan/InferenceInterfaces/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anonymization/modules/tts/IMSToucan/LICENSE b/anonymization/modules/tts/IMSToucan/LICENSE new file mode 100644 index 0000000..5e96996 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright for most of the PyTorch modules: 2017 Johns Hopkins University (Shinji Watanabe) + Copyright for the rest: 2021 University of Stuttgart (Florian Lux) + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. diff --git a/anonymization/modules/tts/IMSToucan/Layers/Attention.py b/anonymization/modules/tts/IMSToucan/Layers/Attention.py new file mode 100644 index 0000000..7e1ee9a --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/Attention.py @@ -0,0 +1,324 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +"""Multi-Head Attention layer definition.""" + +import math + +import numpy +import torch +from torch import nn + +from ..Utility.utils import make_non_pad_mask + + +class MultiHeadedAttention(nn.Module): + """ + Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """ + Construct an MultiHeadedAttention object. + """ + super(MultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """ + Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """ + Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """ + Compute scaled dot product attention. 
+ + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """ + Multi-Head Attention layer with relative position encoding. + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """ + Compute relative positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + Returns: + torch.Tensor: Output tensor. + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1] # only keep the positions from 0 to time2 + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3)), device=x.device) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """ + Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, 2*time1-1, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
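+
+        Example (illustrative; dummy tensors with the shapes documented above):
+            >>> import torch
+            >>> mha = RelPositionMultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
+            >>> x = torch.randn(2, 50, 256)                # (#batch, time1, size)
+            >>> pos_emb = torch.randn(2, 2 * 50 - 1, 256)  # (#batch, 2*time1-1, size)
+            >>> out = mha(x, x, x, pos_emb, None)          # self-attention, out: (2, 50, 256)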
+ """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) + + +class GuidedAttentionLoss(torch.nn.Module): + """ + Guided attention loss function module. + + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + """ + + def __init__(self, sigma=0.4, alpha=1.0): + """ + Initialize guided attention loss module. + + Args: + sigma (float, optional): Standard deviation to control + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + """ + super(GuidedAttentionLoss, self).__init__() + self.sigma = sigma + self.alpha = alpha + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """ + Calculate forward propagation. + + Args: + att_ws (Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens (LongTensor): Batch of input lenghts (B,). + olens (LongTensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. + """ + self._reset_masks() + self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens).to(att_ws.device) + self.masks = self._make_masks(ilens, olens).to(att_ws.device) + losses = self.guided_attn_masks * att_ws + loss = torch.mean(losses.masked_select(self.masks)) + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = torch.zeros((n_batches, max_olen, max_ilen), device=ilens.device) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """ + Make guided attention mask. + """ + grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device).float(), torch.arange(ilen, device=ilen.device).float()) + return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma ** 2))) + + @staticmethod + def _make_masks(ilens, olens): + """ + Make masks indicating non-padded part. + + Args: + ilens (LongTensor or List): Batch of lengths (B,). 
+ olens (LongTensor or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (including 1.2) + """ + in_masks = make_non_pad_mask(ilens, device=ilens.device) # (B, T_in) + out_masks = make_non_pad_mask(olens, device=olens.device) # (B, T_out) + return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) # (B, T_out, T_in) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """ + Guided attention loss function module for multi head attention. + + Args: + sigma (float, optional): Standard deviation to control + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + """ + + def forward(self, att_ws, ilens, olens): + """ + Calculate forward propagation. + + Args: + att_ws (Tensor): + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens (LongTensor): Batch of input lenghts (B,). + olens (LongTensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = (self._make_guided_attention_masks(ilens, olens).to(att_ws.device).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).to(att_ws.device).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = torch.mean(losses.masked_select(self.masks)) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss diff --git a/anonymization/modules/tts/IMSToucan/Layers/Conformer.py b/anonymization/modules/tts/IMSToucan/Layers/Conformer.py new file mode 100644 index 0000000..f62f158 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/Conformer.py @@ -0,0 +1,128 @@ +""" +Taken from ESPNet +""" + +import torch + +from .Attention import RelPositionMultiHeadedAttention +from .Convolution import ConvolutionModule +from .EncoderLayer import EncoderLayer +from .LayerNorm import LayerNorm +from .MultiLayeredConv1d import MultiLayeredConv1d +from .MultiSequential import repeat +from .PositionalEncoding import RelPositionalEncoding +from .Swish import Swish + + +class Conformer(torch.nn.Module): + """ + Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, torch.nn.Module]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Conformer positional encoding layer type. + selfattention_layer_type (str): Conformer attention layer type. + activation_type (str): Conformer activation function type. 
+ use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + + """ + + def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, + attention_dropout_rate=0.0, input_layer="conv2d", normalize_before=True, concat_after=False, positionwise_conv_kernel_size=1, + macaron_style=False, use_cnn_module=False, cnn_module_kernel=31, zero_triu=False, utt_embed=None, + lang_embs=None): + super(Conformer, self).__init__() + + activation = Swish() + self.conv_subsampling_factor = 1 + + if isinstance(input_layer, torch.nn.Module): + self.embed = input_layer + self.pos_enc = RelPositionalEncoding(attention_dim, positional_dropout_rate) + elif input_layer is None: + self.embed = None + self.pos_enc = torch.nn.Sequential(RelPositionalEncoding(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.normalize_before = normalize_before + if utt_embed is not None: + self.embedding_expansion = torch.nn.Linear(utt_embed, attention_dim) + if lang_embs is not None: + self.language_embedding = torch.nn.Embedding(num_embeddings=lang_embs, embedding_dim=attention_dim) + + # self-attention module definition + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu) + + # feed-forward module definition + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat(num_blocks, lambda lnum: EncoderLayer(attention_dim, encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, + normalize_before, concat_after)) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def forward(self, xs, masks, utterance_embedding=None, lang_ids=None): + """ + Encode input sequence. + Args: + utterance_embedding: embedding containing lots of conditioning signals + step: indicator for when to start updating the embedding function + xs (torch.Tensor): Input tensor (#batch, time, idim). + masks (torch.Tensor): Mask tensor (#batch, time). + Returns: + torch.Tensor: Output tensor (#batch, time, attention_dim). + torch.Tensor: Mask tensor (#batch, time). 
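+
+        Example (illustrative; a small decoder-style stack without an input projection,
+            dummy tensors only):
+            >>> import torch
+            >>> # idim is not used when input_layer is None
+            >>> conf = Conformer(idim=0, attention_dim=256, attention_heads=4,
+            ...                  num_blocks=2, input_layer=None, use_cnn_module=True)
+            >>> xs = torch.randn(2, 50, 256)       # (#batch, time, attention_dim)
+            >>> masks = torch.ones(2, 1, 50).bool()
+            >>> out, out_masks = conf(xs, masks)   # out: (2, 50, 256)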
+ """ + + if self.embed is not None: + xs = self.embed(xs) + + if lang_ids is not None: + lang_embs = self.language_embedding(lang_ids) + xs = xs + lang_embs # offset the phoneme distribution of a language + + xs = self.pos_enc(xs) + + xs, masks = self.encoders(xs, masks) + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if utterance_embedding is not None: + xs = self._integrate_with_utt_embed_encoder(xs, utterance_embedding) + + return xs, masks + + def _integrate_with_utt_embed_encoder(self, hs, utt_embeddings): + expanded_embeddings = self.embedding_expansion(utt_embeddings).unsqueeze(1) + return hs + expanded_embeddings diff --git a/anonymization/modules/tts/IMSToucan/Layers/Convolution.py b/anonymization/modules/tts/IMSToucan/Layers/Convolution.py new file mode 100644 index 0000000..e6e56e8 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/Convolution.py @@ -0,0 +1,55 @@ +# Copyright 2020 Johns Hopkins University (Shinji Watanabe) +# Northwestern Polytechnical University (Pengcheng Guo) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux 2021 + + +from torch import nn + + +class ConvolutionModule(nn.Module): + """ + ConvolutionModule in Conformer model. + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernel size of conv layers. + + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + super(ConvolutionModule, self).__init__() + # kernel_size should be an odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, ) + self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, ) + self.norm = nn.GroupNorm(num_groups=32, num_channels=channels) + self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, ) + self.activation = activation + + def forward(self, x): + """ + Compute convolution module. + + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + + Returns: + torch.Tensor: Output tensor (#batch, time, channels). + + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose(1, 2) diff --git a/anonymization/modules/tts/IMSToucan/Layers/DurationPredictor.py b/anonymization/modules/tts/IMSToucan/Layers/DurationPredictor.py new file mode 100644 index 0000000..d779dc0 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/DurationPredictor.py @@ -0,0 +1,139 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + + +import torch + +from .LayerNorm import LayerNorm + + +class DurationPredictor(torch.nn.Module): + """ + Duration predictor module. + + This is a module of duration predictor described + in `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + The duration predictor predicts a duration of each frame in log domain + from the hidden embeddings of encoder. + + .. 
_`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + Note: + The calculation domain of outputs is different + between in `forward` and in `inference`. In `forward`, + the outputs are calculated in log domain but in `inference`, + those are calculated in linear domain. + + """ + + def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0): + """ + Initialize duration predictor module. + + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. + offset (float, optional): Offset value to avoid nan in log domain. + + """ + super(DurationPredictor, self).__init__() + self.offset = offset + self.conv = torch.nn.ModuleList() + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ), torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate), )] + self.linear = torch.nn.Linear(n_chans, 1) + + def _forward(self, xs, x_masks=None, is_inference=False): + xs = xs.transpose(1, -1) # (B, idim, Tmax) + for f in self.conv: + xs = f(xs) # (B, C, Tmax) + + # NOTE: calculate in log domain + xs = self.linear(xs.transpose(1, -1)).squeeze(-1) # (B, Tmax) + + if is_inference: + # NOTE: calculate in linear domain + xs = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long() # avoid negative value + + if x_masks is not None: + xs = xs.masked_fill(x_masks, 0.0) + + return xs + + def forward(self, xs, x_masks=None): + """ + Calculate forward propagation. + + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (ByteTensor, optional): + Batch of masks indicating padded part (B, Tmax). + + Returns: + Tensor: Batch of predicted durations in log domain (B, Tmax). + + """ + return self._forward(xs, x_masks, False) + + def inference(self, xs, x_masks=None): + """ + Inference duration. + + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (ByteTensor, optional): + Batch of masks indicating padded part (B, Tmax). + + Returns: + LongTensor: Batch of predicted durations in linear domain (B, Tmax). + + """ + return self._forward(xs, x_masks, True) + + +class DurationPredictorLoss(torch.nn.Module): + """ + Loss function module for duration predictor. + + The loss value is Calculated in log domain to make it Gaussian. + + """ + + def __init__(self, offset=1.0, reduction="mean"): + """ + Args: + offset (float, optional): Offset value to avoid nan in log domain. + reduction (str): Reduction type in loss calculation. + + """ + super(DurationPredictorLoss, self).__init__() + self.criterion = torch.nn.MSELoss(reduction=reduction) + self.offset = offset + + def forward(self, outputs, targets): + """ + Calculate forward propagation. + + Args: + outputs (Tensor): Batch of prediction durations in log domain (B, T) + targets (LongTensor): Batch of groundtruth durations in linear domain (B, T) + + Returns: + Tensor: Mean squared error loss value. + + Note: + `outputs` is in log domain but `targets` is in linear domain. 
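+
+        Example (illustrative; dummy tensors only):
+            >>> import torch
+            >>> predictor = DurationPredictor(idim=384)
+            >>> criterion = DurationPredictorLoss(offset=1.0)
+            >>> hs = torch.randn(2, 10, 384)           # encoder states (B, Tmax, idim)
+            >>> gold = torch.randint(1, 8, (2, 10))    # ground-truth durations in linear domain
+            >>> loss = criterion(predictor(hs), gold)  # targets become log(gold + offset) internally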
+ + """ + # NOTE: outputs is in log domain while targets in linear + targets = torch.log(targets.float() + self.offset) + loss = self.criterion(outputs, targets) + + return loss diff --git a/anonymization/modules/tts/IMSToucan/Layers/EncoderLayer.py b/anonymization/modules/tts/IMSToucan/Layers/EncoderLayer.py new file mode 100644 index 0000000..4553aec --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/EncoderLayer.py @@ -0,0 +1,144 @@ +# Copyright 2020 Johns Hopkins University (Shinji Watanabe) +# Northwestern Polytechnical University (Pengcheng Guo) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux 2021 + + +import torch +from torch import nn + +from .LayerNorm import LayerNorm + + +class EncoderLayer(nn.Module): + """ + Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (torch.nn.Module): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + + """ + + def __init__(self, size, self_attn, feed_forward, feed_forward_macaron, conv_module, dropout_rate, normalize_before=True, concat_after=False, ): + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm(size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + + def forward(self, x_input, mask, cache=None): + """ + Compute encoded features. + + Args: + x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). 
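+
+        Example (illustrative; MultiHeadedAttention and MultiLayeredConv1d are the modules
+            from this Layers package, tensors are dummies):
+            >>> import torch
+            >>> layer = EncoderLayer(size=256,
+            ...                      self_attn=MultiHeadedAttention(4, 256, 0.1),
+            ...                      feed_forward=MultiLayeredConv1d(256, 1024, 1, 0.1),
+            ...                      feed_forward_macaron=None,
+            ...                      conv_module=None,
+            ...                      dropout_rate=0.1)
+            >>> x = torch.randn(2, 50, 256)
+            >>> out, mask = layer(x, None)             # out: (2, 50, 256)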
+ + """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = torch.cat((x, x_att), dim=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/anonymization/modules/tts/IMSToucan/Layers/LayerNorm.py b/anonymization/modules/tts/IMSToucan/Layers/LayerNorm.py new file mode 100644 index 0000000..c4cb4c1 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/LayerNorm.py @@ -0,0 +1,36 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +import torch + + +class LayerNorm(torch.nn.LayerNorm): + """ + Layer normalization module. + + Args: + nout (int): Output dim size. + dim (int): Dimension to be normalized. + """ + + def __init__(self, nout, dim=-1): + """ + Construct an LayerNorm object. + """ + super(LayerNorm, self).__init__(nout, eps=1e-12) + self.dim = dim + + def forward(self, x): + """ + Apply layer normalization. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Normalized tensor. + """ + if self.dim == -1: + return super(LayerNorm, self).forward(x) + return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) diff --git a/anonymization/modules/tts/IMSToucan/Layers/LengthRegulator.py b/anonymization/modules/tts/IMSToucan/Layers/LengthRegulator.py new file mode 100644 index 0000000..f9ae4e0 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/LengthRegulator.py @@ -0,0 +1,62 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + +from abc import ABC + +import torch + +from ..Utility.utils import pad_list + + +class LengthRegulator(torch.nn.Module, ABC): + """ + Length regulator module for feed-forward Transformer. + + This is a module of length regulator described in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 
+ The length regulator expands char or + phoneme-level embedding features to frame-level by repeating each + feature based on the corresponding predicted durations. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + """ + + def __init__(self, pad_value=0.0): + """ + Initialize length regulator module. + + Args: + pad_value (float, optional): Value used for padding. + """ + super(LengthRegulator, self).__init__() + self.pad_value = pad_value + + def forward(self, xs, ds, alpha=1.0): + """ + Calculate forward propagation. + + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (LongTensor): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. + + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). + """ + if alpha != 1.0: + assert alpha > 0 + ds = torch.round(ds.float() * alpha).long() + + if ds.sum() == 0: + ds[ds.sum(dim=1).eq(0)] = 1 + + return pad_list([self._repeat_one_sequence(x, d) for x, d in zip(xs, ds)], self.pad_value) + + def _repeat_one_sequence(self, x, d): + """ + Repeat each frame according to duration + """ + return torch.repeat_interleave(x, d, dim=0) diff --git a/anonymization/modules/tts/IMSToucan/Layers/MultiLayeredConv1d.py b/anonymization/modules/tts/IMSToucan/Layers/MultiLayeredConv1d.py new file mode 100644 index 0000000..f2de4a0 --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/MultiLayeredConv1d.py @@ -0,0 +1,87 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + +""" +Layer modules for FFT block in FastSpeech (Feed-forward Transformer). +""" + +import torch + + +class MultiLayeredConv1d(torch.nn.Module): + """ + Multi-layered conv1d for Transformer block. + + This is a module of multi-layered conv1d designed + to replace positionwise feed-forward network + in Transformer block, which is introduced in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """ + Initialize MultiLayeredConv1d module. + + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super(MultiLayeredConv1d, self).__init__() + self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.w_2 = torch.nn.Conv1d(hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.dropout = torch.nn.Dropout(dropout_rate) + + def forward(self, x): + """ + Calculate forward propagation. + + Args: + x (torch.Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + torch.Tensor: Batch of output tensors (B, T, hidden_chans). + """ + x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) + return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) + + +class Conv1dLinear(torch.nn.Module): + """ + Conv1D + Linear for Transformer block. + + A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """ + Initialize Conv1dLinear module. + + Args: + in_chans (int): Number of input channels. 
+ hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super(Conv1dLinear, self).__init__() + self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.w_2 = torch.nn.Linear(hidden_chans, in_chans) + self.dropout = torch.nn.Dropout(dropout_rate) + + def forward(self, x): + """ + Calculate forward propagation. + + Args: + x (torch.Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + torch.Tensor: Batch of output tensors (B, T, hidden_chans). + """ + x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) + return self.w_2(self.dropout(x)) diff --git a/anonymization/modules/tts/IMSToucan/Layers/MultiSequential.py b/anonymization/modules/tts/IMSToucan/Layers/MultiSequential.py new file mode 100644 index 0000000..bccf8cd --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/MultiSequential.py @@ -0,0 +1,33 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +import torch + + +class MultiSequential(torch.nn.Sequential): + """ + Multi-input multi-output torch.nn.Sequential. + """ + + def forward(self, *args): + """ + Repeat. + """ + for m in self: + args = m(*args) + return args + + +def repeat(N, fn): + """ + Repeat module N times. + + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. + + Returns: + MultiSequential: Repeated model instance. + """ + return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/anonymization/modules/tts/IMSToucan/Layers/PositionalEncoding.py b/anonymization/modules/tts/IMSToucan/Layers/PositionalEncoding.py new file mode 100644 index 0000000..8929a7f --- /dev/null +++ b/anonymization/modules/tts/IMSToucan/Layers/PositionalEncoding.py @@ -0,0 +1,166 @@ +""" +Taken from ESPNet +""" + +import math + +import torch + + +class PositionalEncoding(torch.nn.Module): + """ + Positional encoding. + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + reverse (bool): Whether to reverse the input position. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): + """ + Construct an PositionalEncoding object. + """ + super(PositionalEncoding, self).__init__() + self.d_model = d_model + self.reverse = reverse + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0, device=d_model.device).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + if self.reverse: + position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp(torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x): + """ + Add positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). 
+ + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class RelPositionalEncoding(torch.nn.Module): + """ + Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """ + Construct an PositionalEncoding object. + """ + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i