# self_vc_notes.txt
# Dependency (parselmouth is needed for pitch extraction and --augment_embeddings in Step 2):
pip install praat-parselmouth
# Training / Validation Manifests
# Train manifest:
wget https://expressivecloning.s3.us-east-2.amazonaws.com/libri_train_selfvc.json
# Val manifest:
wget https://expressivecloning.s3.us-east-2.amazonaws.com/libri_val_selfvc.json
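# For reference, each manifest follows the standard NeMo JSON-lines layout (one
# utterance per line). The entry shown in the comment below is illustrative, not
# copied from the actual files:
head -n 1 libri_train_selfvc.json
# expected shape (illustrative): {"audio_filepath": "/path/to/utt.wav", "duration": 5.2, "text": "..."}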
# Pretrained checkpoints
Conformer: https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/ConformerSSL_LibriCSS.ckpt
FastPitch: https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/FastPitch_LibriCSS.ckpt
HiFiGAN: https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/HiFiFinetuned.ckpt
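# The checkpoints can be fetched directly; the SelfVC_Checkpoints directory below is an
# assumption that matches the relative paths used in the first inference command:
mkdir -p SelfVC_Checkpoints && cd SelfVC_Checkpoints
wget https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/ConformerSSL_LibriCSS.ckpt
wget https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/FastPitch_LibriCSS.ckpt
wget https://expressivecloning.s3.us-east-2.amazonaws.com/SelfVC_Checkpoints/HiFiFinetuned.ckpt
cd -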
# Step 1: Conformer SSL training command
# NOTE: This example points model.train_ds at the val manifest, presumably as a quick
# smoke test; substitute libri_train_selfvc.json for a real training run.
python examples/asr/speech_pretraining/speech_pre_training.py \
model.train_ds.manifest_filepath="/datap/LibriTTS/libri_val_selfvc.json" \
model.validation_ds.manifest_filepath="/datap/LibriTTS/libri_val_selfvc.json" \
exp_manager.resume_if_exists=true \
exp_manager.resume_ignore_no_checkpoint=true \
trainer.devices=-1 \
trainer.accelerator="gpu" \
trainer.max_epochs=100 \
model.optim.name="adamw" \
model.optim.lr=0.00005 \
model.optim.betas=[0.9,0.999] \
model.optim.weight_decay=0.0001 \
model.train_ds.min_duration=3.0 \
model.validation_ds.min_duration=3.0 \
~model.optim.sched \
model.train_ds.batch_size=4 \
model.validation_ds.batch_size=4 \
model.encoder.feat_out=256 \
model.loss_list.contrastive.loss.num_negatives=20 \
exp_manager.exp_dir="/datap/SelfVCRoughExperiments/PretrainingExperiments" \
exp_manager.name="conformerSSL" \
--config-path="/home/pneekhara/2023/NemoSelfVC/NeMo/examples/asr/conf/ssl/conformer/" \
--config-name="conformer_ssl_22050_selfvc" ;
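# After training, exp_manager writes checkpoints under exp_dir/name/. The exact
# subdirectory layout depends on the NeMo version, but it is along these lines:
ls /datap/SelfVCRoughExperiments/PretrainingExperiments/conformerSSL/checkpoints/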
# Step 2: Create sup data - extract Conformer embeddings, speaker embeddings and pitch contours
# NOTE: Sometimes the script hangs while extracting the pitch contours. In that case,
# rerun the script with --extract_only_pitch_contours 1 (see the rerun sketch after
# the command below).
# Pass --augment_embeddings 0 if you don't need embeddings of transformed audio
# (augmented embeddings are used only in SelfVC for voice conversion).
# parselmouth (installed at the top) is required for --augment_embeddings 1, and
# extraction is considerably slower with it enabled.
# The script writes its outputs to --sup_data_dir.
python scripts/ssl_tts/make_supdata_self_vc.py \
--ssl_model_ckpt_path /datap/misc/SelfVC_Checkpoints/ConformerICLR.ckpt \
--manifest_paths "/datap/LibriTTS/libri_val_selfvc.json" \
--sup_data_dir "/datap/SupDataDirs/SelfVC_LibriValSupData" \
--batch_size 32 \
--augment_embeddings 1 \
--compute_pitch_contours 1 \
--extract_only_pitch_contours 0 \
--pool_workers 8 \
--num_workers 16 ;
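# If the pitch-extraction hang mentioned above occurs, rerun with the same paths and
# only the pitch-contour stage enabled:
python scripts/ssl_tts/make_supdata_self_vc.py \
--ssl_model_ckpt_path /datap/misc/SelfVC_Checkpoints/ConformerICLR.ckpt \
--manifest_paths "/datap/LibriTTS/libri_val_selfvc.json" \
--sup_data_dir "/datap/SupDataDirs/SelfVC_LibriValSupData" \
--batch_size 32 \
--augment_embeddings 1 \
--compute_pitch_contours 1 \
--extract_only_pitch_contours 1 \
--pool_workers 8 \
--num_workers 16 ;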
# Step 3: FastPitch training
# Pass content_aug_types='[]' to disable augmentations.
# Remove +model.self_converted_aug_start_step=100000 if you do not want self
# transformations (only used for voice conversion).
# hifi_ckpt_path is used to log audio during training.
# pitch_mean/pitch_std are dataset-level pitch statistics (the values below match this
# LibriTTS setup); recompute them if training on different data.
python examples/tts/fastpitch_ssl.py \
pitch_mean=212.35 \
pitch_std=38.75 \
use_unique_tokens=true \
content_aug_types='["f_transform", "g_transform"]' \
model.emb_similarity_threshold=0.925 \
train_dataset="/datap/LibriTTS/libri_val_selfvc.json" \
validation_datasets="/datap/LibriTTS/libri_val_selfvc.json" \
ssl_model_ckpt_path="/datap/misc/SelfVC_Checkpoints/ConformerICLR.ckpt" \
hifi_ckpt_path="/datap/misc/SelfVC_Checkpoints/HiFiGANICLR.ckpt" \
model.train_ds.dataloader_params.batch_size=32 \
model.validation_ds.dataloader_params.batch_size=32 \
exp_manager.exp_dir="/datap/SelfVCRoughExperiments" \
exp_manager.name="Run1FromScratch" \
model.n_datasets=1 \
sup_data_dir="/datap/SupDataDirs/SelfVC_LibriValSupData" \
ssl_content_emb_type="embedding" \
model.content_emb_indim=256 \
model.content_emb_outdim=256 \
model.speaker_emb_indim=192 \
model.speaker_emb_outdim=256 \
model.symbols_embedding_dim=512 \
model.optim.lr=0.00007 \
+model.self_converted_aug_start_step=100000 \
trainer.max_epochs=500 ;
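# Training progress (including the audio logged via hifi_ckpt_path) can typically be
# monitored with TensorBoard, assuming the default exp_manager logger settings:
tensorboard --logdir /datap/SelfVCRoughExperiments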
# INFERENCE
---------------------------------------------------------------------
# The inference script also supports batched inference: instead of --source_audio_path,
# --target_audio_path and --out_path, pass a source_target_out_pairs.txt file
# (see the batched sketch after the commands below).
# Each line in source_target_out_pairs.txt is formatted as source.wav;target.wav;out.wav
# Sample Source Wav: https://expressivecloning.s3.us-east-2.amazonaws.com/AMTEVAL/SRC_UNSEEN_LIBRI/source_30.wav
# Sample Target Wav: https://expressivecloning.s3.us-east-2.amazonaws.com/AMTEVAL/TARGET_UNSEEN_LIBRI/targetspeaker_29.wav
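# The sample files can be fetched directly:
wget https://expressivecloning.s3.us-east-2.amazonaws.com/AMTEVAL/SRC_UNSEEN_LIBRI/source_30.wav
wget https://expressivecloning.s3.us-east-2.amazonaws.com/AMTEVAL/TARGET_UNSEEN_LIBRI/targetspeaker_29.wav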
# Single-pair inference with the LibriCSS checkpoints:
python scripts/ssl_tts/self_vc.py \
--source_audio_path source_audio.wav \
--target_audio_path target_speaker_audio.wav \
--out_path out.wav \
--fastpitch_ckpt_path SelfVC_Checkpoints/FastPitch_LibriCSS.ckpt \
--ssl_model_ckpt_path SelfVC_Checkpoints/ConformerSSL_LibriCSS.ckpt \
--hifi_ckpt_path SelfVC_Checkpoints/HiFiFinetuned.ckpt \
--compute_pitch 1 --compute_duration 0 --use_unique_tokens=0 ;
# Same invocation with absolute paths (the sample wavs above and the ICLR checkpoints):
python scripts/ssl_tts/self_vc.py \
--source_audio_path /datap/source_30.wav \
--target_audio_path /datap/targetspeaker_29.wav \
--out_path out.wav \
--fastpitch_ckpt_path /datap/misc/SelfVC_Checkpoints/FastPitchICLR.ckpt \
--ssl_model_ckpt_path /datap/misc/SelfVC_Checkpoints/ConformerICLR.ckpt \
--hifi_ckpt_path /datap/misc/SelfVC_Checkpoints/HiFiGANICLR.ckpt \
--compute_pitch 1 --compute_duration 0 --use_unique_tokens=0 ;
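# Batched inference, a minimal sketch: the pairs-file flag name below
# (--source_target_out_pairs) is an assumption based on the file name mentioned
# above; check the script's argument parser for the actual flag.
cat > source_target_out_pairs.txt << 'EOF'
/datap/source_30.wav;/datap/targetspeaker_29.wav;out_29.wav
EOF
python scripts/ssl_tts/self_vc.py \
--source_target_out_pairs source_target_out_pairs.txt \
--fastpitch_ckpt_path /datap/misc/SelfVC_Checkpoints/FastPitchICLR.ckpt \
--ssl_model_ckpt_path /datap/misc/SelfVC_Checkpoints/ConformerICLR.ckpt \
--hifi_ckpt_path /datap/misc/SelfVC_Checkpoints/HiFiGANICLR.ckpt \
--compute_pitch 1 --compute_duration 0 --use_unique_tokens=0 ;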