From 7231aca5890536f8f5a26e8a12309a7a4ef58570 Mon Sep 17 00:00:00 2001 From: Oktai Tatanov Date: Wed, 16 Feb 2022 19:23:25 +0300 Subject: [PATCH] [TTS] Update TTS tutorials, Simplification of testing Mixer-TTS and FastPitch (#3680) * update notebooks Signed-off-by: Oktai Tatanov * small fix in FastPitch_Finetuning.ipynb Signed-off-by: Oktai Tatanov * update notebooks Signed-off-by: Oktai Tatanov * fix in Inference_ModelSelect.ipynb Signed-off-by: Oktai Tatanov * fix librosa Signed-off-by: Oktai Tatanov * fix style Signed-off-by: Oktai Tatanov * update jenkinsfile, remove unnecessary line in fastpitch Signed-off-by: Oktai Tatanov --- Jenkinsfile | 8 +- nemo/collections/tts/models/fastpitch.py | 2 - nemo/collections/tts/torch/data.py | 2 +- tutorials/tts/FastPitch_Finetuning.ipynb | 211 ++++++++++++------ .../tts/FastPitch_MixerTTS_Training.ipynb | 22 +- .../tts/Inference_DurationPitchControl.ipynb | 6 +- tutorials/tts/Inference_ModelSelect.ipynb | 6 +- tutorials/tts/Tacotron2_Training.ipynb | 6 +- tutorials/tts/TalkNet_Training.ipynb | 6 +- 9 files changed, 170 insertions(+), 99 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1223b7e1e640..296074445193 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2304,7 +2304,9 @@ pipeline { model.input_fft.n_layer=2 \ model.output_fft.d_inner=384 \ model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch' + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs' } } stage('Mixer-TTS') { @@ -2320,7 +2322,9 @@ pipeline { model.train_ds.dataloader_params.num_workers=1 \ model.validation_ds.dataloader_params.batch_size=4 \ model.validation_ds.dataloader_params.num_workers=1 \ - ~trainer.check_val_every_n_epoch' + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs' } } stage('Hifigan') { diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 02d688044232..e48b018d2c90 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -197,8 +197,6 @@ def parser(self): def parse(self, str_input: str, normalize=True) -> torch.tensor: if self.training: logging.warning("parse() is meant to be called in eval mode.") - if str_input[-1] not in [".", "!", "?"]: - str_input = str_input + "." if normalize and self.text_normalizer_call is not None: str_input = self.text_normalizer_call(str_input, **self.text_normalizer_call_kwargs) diff --git a/nemo/collections/tts/torch/data.py b/nemo/collections/tts/torch/data.py index d9e6a8589b92..f9001d883c48 100644 --- a/nemo/collections/tts/torch/data.py +++ b/nemo/collections/tts/torch/data.py @@ -736,7 +736,7 @@ def __init__( json. Each line should contain the following: "audio_filepath": , "duration": (Optional), - "mel_filepath": (Optional) + "mel_filepath": (Optional, can be in .npy (numpy.save) or .pt (torch.save) format) sample_rate (int): The sample rate of the audio. Or the sample rate that we will resample all files to. n_segments (int): The length of audio in samples to load. For example, given a sample rate of 16kHz, and n_segments=16000, a random 1 second section of audio from the clip will be loaded. 
The section will
diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb
index 1aa2cc463642..cfe4bfaa69d8 100755
--- a/tutorials/tts/FastPitch_Finetuning.ipynb
+++ b/tutorials/tts/FastPitch_Finetuning.ipynb
@@ -61,7 +61,7 @@
     "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n",
     "# !apt-get install sox libsndfile1 ffmpeg\n",
     "# !pip install wget unidecode\n",
-    "# !python -m pip install git+https://github.com/NeMo/NeMo.git@$BRANCH#egg=nemo_toolkit[tts]"
+    "# !python -m pip install git+https://github.com/NeMo/NeMo.git@$BRANCH#egg=nemo_toolkit[all]"
   ]
  },
 {
@@ -71,8 +71,7 @@
    "id": "2502cf61"
   },
   "source": [
-    "## Downloading Data\n",
-    "___"
+    "## Downloading data"
   ]
  },
 {
@@ -82,9 +81,7 @@
    "id": "81fa2c02"
   },
   "source": [
-    "Download and untar the data.\n",
-    "\n",
-    "The data contains a 5 minute subset of audio from speaker 6097 from the HiFiTTS dataset."
+    "For this tutorial, we will use a small part of the Hi-Fi Multi-Speaker English TTS (Hi-Fi TTS) dataset. You can read more about the dataset [here](https://arxiv.org/abs/2104.01497). As the target speaker we will use the speaker whose id is 6097, and only a 5-minute subset of their audio, which we have additionally resampled to 22050 Hz."
   ]
  },
@@ -107,7 +104,7 @@
    "id": "gSQqq0fBqy8K"
   },
   "source": [
    "Looking at `manifest.json`, we see a standard NeMo json that contains the filepath, text, and duration. Please note that our `manifest.json` contains relative audio paths.\n",
    "\n",
    "```\n",
    "{\"audio_filepath\": \"audio/presentpictureofnsw_02_mann_0532.wav\", \"text\": \"not to stop more than ten minutes by the way\", \"duration\": 2.6, \"text_no_preprocessing\": \"not to stop more than ten minutes by the way,\", \"text_normalized\": \"not to stop more than ten minutes by the way,\"}\n",
@@ -130,18 +127,6 @@
    "!ln -s ./6097_5_mins/audio audio"
   ]
  },
- {
-  "cell_type": "markdown",
-  "id": "779af190",
-  "metadata": {
-   "id": "ef75d1d5"
-  },
-  "source": [
-   "## Finetuning FastPitch\n",
-   "___",
-   "\n"
-  ]
- },
 {
  "cell_type": "markdown",
  "id": "bf1f9c46",
  "metadata": {
   "id": "lhhg2wBNtW0r"
  },
  "source": [
   "Let's also download the pretrained checkpoint that we want to finetune from. NeMo will save checkpoints to `~/.cache`, so let's move that to our current directory. \n",
   "\n",
   "*Note: please, check that `home_path` refers to your home folder. Otherwise, change it manually.*"
  ]
 },
@@ -200,7 +185,7 @@
   "id": "6c8b13b8"
  },
  "source": [
   "To finetune the FastPitch model on the filelists created above, we use the `examples/tts/fastpitch_finetune.py` script to train the model with the `fastpitch_align_v1.05.yaml` configuration.\n",
   "\n",
   "Let's grab those files."
  ]
 },
@@ -215,7 +200,44 @@
  "outputs": [],
  "source": [
   "!wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/fastpitch_finetune.py\n",
-   "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/fastpitch_align.yaml && cd .."
+   "\n",
+   "!mkdir -p conf \\\n",
+   "&& cd conf \\\n",
+   "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_v1.05.yaml \\\n",
+   "&& cd .."
  ]
 },
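Once the config is downloaded, it can be worth a quick peek before overriding anything. A minimal sketch, assuming the wget cell above has run and `conf/fastpitch_align_v1.05.yaml` exists; the field names are taken from the override flags used in the finetuning command later in this patch:

```python
# Minimal sketch: inspect the downloaded FastPitch config before overriding it.
# Assumes conf/fastpitch_align_v1.05.yaml was fetched by the cell above.
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/fastpitch_align_v1.05.yaml")
# Print the optimizer/scheduler defaults that the finetuning command replaces
# (to_yaml does not resolve ${...} interpolations, so this is safe to print).
print(OmegaConf.to_yaml(cfg.model.optim))
```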
+  {
+   "cell_type": "markdown",
+   "id": "5415162b",
+   "metadata": {},
+   "source": [
+    "We also need some additional files for training (see the `FastPitch_MixerTTS_Training.ipynb` tutorial for more details). Let's download them too."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20374059",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# additional files\n",
+    "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n",
+    "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+    "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n",
+    "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv \\\n",
+    "&& cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "779af190",
+   "metadata": {
+    "id": "ef75d1d5"
+   },
+   "source": [
+    "## Finetuning FastPitch"
+   ]
+  },
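Before launching the run, it can help to sanity-check the filelists that the command below consumes. A minimal, hedged sketch; the field names follow the manifest format shown earlier in this tutorial, and the training split should total roughly five minutes:

```python
# Minimal sketch: verify the finetuning manifests are well-formed JSONL.
import json

for manifest in ["6097_manifest_train_dur_5_mins_local.json",
                 "6097_manifest_dev_ns_all_local.json"]:
    with open(manifest) as f:
        records = [json.loads(line) for line in f]
    total_sec = sum(r["duration"] for r in records)
    print(f"{manifest}: {len(records)} utterances, {total_sec / 60:.1f} min")
```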
 {
@@ -227,9 +249,7 @@
  },
  "source": [
   "We can now train our model with the following command:\n",
   "\n",
-   "**NOTE: This will take about 50 minutes on colab's K80 GPUs.**\n",
-   "\n",
-   "`python fastpitch_finetune.py --config-name=fastpitch_align.yaml train_dataset=./6097_manifest_train_dur_5_mins_local.json validation_datasets=./6097_manifest_dev_ns_all_local.json +init_from_nemo_model=./tts_en_fastpitch_align.nemo +trainer.max_steps=1000 ~trainer.max_epochs trainer.check_val_every_n_epoch=25 prior_folder=./Priors6097 model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24 exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins model.n_speakers=1 model.pitch_avg=121.9 model.pitch_std=23.1 model.pitch_fmin=30 model.pitch_fmax=512 model.optim.lr=2e-4 ~model.optim.sched model.optim.name=adam trainer.devices=1 trainer.strategy=null`"
+   "**NOTE: This will take about 50 minutes on colab's K80 GPUs.**"
  ]
 },
 {
@@ -241,19 +261,23 @@
  },
  "outputs": [],
  "source": [
-   "!(python fastpitch_finetune.py --config-name=fastpitch_align.yaml \\\n",
+   "# TODO(oktai15): remove +model.text_tokenizer.add_blank_at=true when we update FastPitch checkpoint\n",
+   "!(python fastpitch_finetune.py --config-name=fastpitch_align_v1.05.yaml \\\n",
    "  train_dataset=./6097_manifest_train_dur_5_mins_local.json \\\n",
    "  validation_datasets=./6097_manifest_dev_ns_all_local.json \\\n",
+   "  sup_data_path=./fastpitch_sup_data \\\n",
+   "  phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+   "  heteronyms_path=tts_dataset_files/heteronyms-030921 \\\n",
+   "  whitelist_path=tts_dataset_files/whitelist_lj_speech.tsv \\\n",
+   "  exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins \\\n",
    "  +init_from_nemo_model=./tts_en_fastpitch_align.nemo \\\n",
    "  +trainer.max_steps=1000 ~trainer.max_epochs \\\n",
    "  trainer.check_val_every_n_epoch=25 \\\n",
-   "  prior_folder=./Priors6097 \\\n",
-   "  model.train_ds.dataloader_params.batch_size=24 \\\n",
-   "  model.validation_ds.dataloader_params.batch_size=24 \\\n",
-   "  exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins \\\n",
-   "  model.n_speakers=1 model.pitch_avg=121.9 model.pitch_std=23.1 \\\n",
+   "  model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24 \\\n",
+   "  model.n_speakers=1 model.pitch_mean=121.9 model.pitch_std=23.1 \\\n",
    "  model.pitch_fmin=30 model.pitch_fmax=512 model.optim.lr=2e-4 \\\n",
    "  ~model.optim.sched model.optim.name=adam trainer.devices=1 trainer.strategy=null \\\n",
+   "  +model.text_tokenizer.add_blank_at=true \\\n",
    ")"
  ]
 },
 {
@@ -266,11 +290,22 @@
  "source": [
   "Let's take a closer look at the training command:\n",
   "\n",
-   "* `python fastpitch_finetune.py --config-name=fastpitch_align.yaml`\n",
+   "* `python fastpitch_finetune.py --config-name=fastpitch_align_v1.05.yaml`\n",
   "  * --config-name tells the script what config to use.\n",
   "\n",
-   "* `train_dataset=./6097_manifest_train_dur_5_mins_local.json validation_datasets=./6097_manifest_dev_ns_all_local.json`\n",
-   "  * We tell the model what manifest files we can to train and eval on.\n",
+   "* `train_dataset=./6097_manifest_train_dur_5_mins_local.json \n",
+   "  validation_datasets=./6097_manifest_dev_ns_all_local.json \n",
+   "  sup_data_path=./fastpitch_sup_data`\n",
+   "  * We tell the script which manifest files to train and evaluate on, and where supplementary data is located (or will be computed and saved during training).\n",
+   "  \n",
+   "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \n",
+   "heteronyms_path=tts_dataset_files/heteronyms-030921\n",
+   "whitelist_path=tts_dataset_files/whitelist_lj_speech.tsv \n",
+   "`\n",
+   "  * We tell the script where the phoneme dictionary (`phoneme_dict_path`), the heteronyms file (`heteronyms_path`), and the normalization whitelist (`whitelist_path`) are located.\n",
+   "  \n",
+   "* `exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins`\n",
+   "  * Where we want to save our log files, tensorboard file, checkpoints, and more.\n",
   "\n",
   "* `+init_from_nemo_model=./tts_en_fastpitch_align.nemo`\n",
   "  * We tell the script what checkpoint to finetune from.\n",
   "\n",
   "* `+trainer.max_steps=1000 ~trainer.max_epochs trainer.check_val_every_n_epoch=25`\n",
   "  * For this experiment, we need to tell the script to train for 1000 training steps/iterations. We need to remove max_epochs using `~trainer.max_epochs`.\n",
   "\n",
-   "* `prior_folder=./Priors6097 model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24`\n",
-   "  * Some dataset parameters. The dataset does some online processing and stores the processing steps to the `prior_folder`.\n",
-   "\n",
-   "* `exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins`\n",
-   "  * Where we want to save our log files, tensorboard file, checkpoints, and more\n",
+   "* `model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24`\n",
+   "  * Set batch sizes.\n",
   "\n",
   "* `model.n_speakers=1`\n",
   "  * The number of speakers in the data. There is only 1 for now, but we will revisit this parameter later in the notebook.\n",
   "\n",
-   "* `model.pitch_avg=121.9 model.pitch_std=23.1 model.pitch_fmin=30 model.pitch_fmax=512`\n",
+   "* `model.pitch_mean=121.9 model.pitch_std=23.1 model.pitch_fmin=30 model.pitch_fmax=512`\n",
   "  * For the new speaker, we need to define new pitch hyperparameters for better audio quality.\n",
-   "  * These parameters work for speaker 6097 from the HiFiTTS dataset\n",
-   "  * For speaker 92, we suggest `model.pitch_avg=214.5 model.pitch_std=30.9 model.pitch_fmin=80 model.pitch_fmax=512`\n",
+   "  * These parameters work for speaker 6097 from the HiFiTTS dataset.\n",
+   "  * For speaker 92, we suggest `model.pitch_mean=214.5 model.pitch_std=30.9 model.pitch_fmin=80 model.pitch_fmax=512`.\n",
   "  * fmin and fmax are hyperparameters to librosa's pyin function. We recommend tweaking these per speaker.\n",
-   "  * After fmin and fmax are defined, pitch mean and std can be easily extracted\n",
+   "  * After fmin and fmax are defined, pitch mean and std can be easily extracted (see the sketch after this list).\n",
   "\n",
   "* `model.optim.lr=2e-4 ~model.optim.sched model.optim.name=adam`\n",
-   "  * For fine-tuning, we lower the learning rate\n",
-   "  * We use a fixed learning rate of 2e-4\n",
-   "  * We switch from the lamb optimizer to the adam optimizer\n",
+   "  * For fine-tuning, we lower the learning rate.\n",
+   "  * We use a fixed learning rate of 2e-4.\n",
+   "  * We switch from the lamb optimizer to the adam optimizer.\n",
   "\n",
   "* `trainer.devices=1 trainer.strategy=null`\n",
-   "  * For this notebook, we default to 1 gpu which means that we do not need ddp\n",
-   "  * If you have the compute resources, feel free to scale this up to the number of free gpus you have available\n",
-   "  * Please remove the `trainer.strategy=null` section if you intend on multi-gpu training"
+   "  * For this notebook, we default to 1 gpu, which means that we do not need ddp.\n",
+   "  * If you have the compute resources, feel free to scale this up to the number of free gpus you have available.\n",
+   "  * Please remove the `trainer.strategy=null` section if you intend on multi-gpu training."
  ]
 },
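As referenced in the pitch bullet above, the statistics for a new speaker can be estimated directly from their wavs. A hedged sketch using librosa's pyin; the fmin/fmax values and the audio directory mirror the speaker 6097 setup in this tutorial:

```python
# Hedged sketch: estimate pitch_mean/pitch_std for a new speaker.
# fmin/fmax and the audio/ directory follow the speaker 6097 setup above.
import glob

import librosa
import numpy as np

voiced_pitches = []
for wav_path in glob.glob("audio/*.wav"):
    y, sr = librosa.load(wav_path, sr=22050)
    f0, voiced_flag, _ = librosa.pyin(y, fmin=30, fmax=512, sr=sr)
    voiced_pitches.append(f0[voiced_flag])  # keep only voiced frames (unvoiced are NaN)

voiced_pitches = np.concatenate(voiced_pitches)
print(f"pitch_mean={voiced_pitches.mean():.1f} Hz, pitch_std={voiced_pitches.std():.1f} Hz")
```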
 {
@@ -312,10 +344,7 @@
   "id": "c3bdf1ed"
  },
  "source": [
-   "## Synthesize Samples from Finetuned Checkpoints\n",
-   "\n",
-   "---\n",
-   "\n"
+   "## Synthesize Samples from Finetuned Checkpoints"
  ]
 },
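The synthesis cells of this section are unchanged by the patch and therefore not shown. For orientation, they do roughly the following; a hedged sketch whose checkpoint path is illustrative, while `parse`, `generate_spectrogram`, and `convert_spectrogram_to_audio` are the calls this notebook already uses:

```python
# Hedged sketch of the synthesis step; the .nemo path is illustrative.
import soundfile as sf
import torch
from nemo.collections.tts.models import FastPitchModel, HifiGanModel

spec_gen_model = FastPitchModel.restore_from("<finetuned_fastpitch>.nemo").eval().cuda()
vocoder = HifiGanModel.from_pretrained("tts_hifigan").eval().cuda()

with torch.no_grad():
    tokens = spec_gen_model.parse("Hey, this produces speech in the finetuned voice!")
    spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=1)
    audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)

sf.write("finetuned_sample.wav", audio.cpu().numpy()[0], 22050)
```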
 {
@@ -482,7 +511,6 @@
  },
  "source": [
   "## Improving Speech Quality\n",
-   "___\n",
   "\n",
   "We see that by fine-tuning FastPitch we were able to generate audio in a male voice, but the audio quality is not as good as we expect. We recommend two steps to improve audio quality:\n",
   "\n",
@@ -495,6 +523,20 @@
   "From the synthesized samples, there might be audible audio crackling. To fix this, we need to finetune HiFiGAN on the new speaker's data. HiFiGAN shows improvement when trained on synthesized mel spectrograms, so the first step is to generate mel spectrograms with our finetuned FastPitch model.\n",
   "\n",
   "```python\n",
+   "import json\n",
+   "import numpy as np\n",
+   "import torch\n",
+   "import soundfile as sf\n",
+   "\n",
+   "from pathlib import Path\n",
+   "\n",
+   "from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator\n",
+   "\n",
+   "\n",
+   "def load_wav(audio_file):\n",
+   "    with sf.SoundFile(audio_file, 'r') as f:\n",
+   "        samples = f.read(dtype='float32')\n",
+   "    return samples.transpose()\n",
+   "\n",
   "# Get records from the training manifest\n",
   "manifest_path = \"./6097_manifest_train_dur_5_mins_local.json\"\n",
   "records = []\n",
   "with open(manifest_path, \"r\") as f:\n",
   "    for i, line in enumerate(f):\n",
   "        records.append(json.loads(line))\n",
   "\n",
-   "# Generate a spectrogram for each item\n",
+   "beta_binomial_interpolator = BetaBinomialInterpolator()\n",
+   "spec_gen_model.eval()\n",
+   "\n",
+   "device = spec_gen_model.device\n",
+   "\n",
+   "save_dir = Path(\"./6097_manifest_train_dur_5_mins_local_mels\")\n",
+   "save_dir.mkdir(exist_ok=True, parents=True)\n",
+   "\n",
+   "# Generate spectrograms (we need to use the ground-truth alignment for correct matching between audio and mels)\n",
   "for i, r in enumerate(records):\n",
-   "    with torch.no_grad():\n",
-   "        parsed = spec_gen_model.parse(r['text'])\n",
-   "        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, speaker=1)\n",
-   "        if isinstance(spectrogram, torch.Tensor):\n",
-   "            spectrogram = spectrogram.to('cpu').numpy()\n",
-   "        if len(spectrogram.shape) == 3:\n",
-   "            spectrogram = spectrogram[0]\n",
-   "    np.save(f\"mel_{i}\", spectrogram)\n",
-   "    r[\"mel_filepath\"] = f\"mel_{i}.npy\"\n",
-   "\n",
-   "# Save to a new json\n",
-   "with open(\"hifigan_train_ft.json\", \"w\") as f:\n",
-   "    for r in records:\n",
-   "        f.write(json.dumps(r) + '\\n')\n",
+   "    audio = load_wav(r[\"audio_filepath\"])\n",
+   "    audio = torch.from_numpy(audio).unsqueeze(0).to(device)\n",
+   "    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)\n",
+   "    \n",
+   "    with torch.no_grad():\n",
+   "        if \"normalized_text\" in r:\n",
+   "            text = spec_gen_model.parse(r[\"normalized_text\"], normalize=False)\n",
+   "        else:\n",
+   "            text = spec_gen_model.parse(r['text'])\n",
+   "        \n",
+   "        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)\n",
+   "        \n",
+   "        spect, spect_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_len)\n",
   "\n",
+   "        attn_prior = torch.from_numpy(\n",
+   "            beta_binomial_interpolator(spect_len.item(), text_len.item())\n",
+   "        ).unsqueeze(0).to(text.device)\n",
+   "        \n",
+   "        spectrogram = spec_gen_model.forward(\n",
+   "            text=text, \n",
+   "            input_lens=text_len, \n",
+   "            spec=spect, \n",
+   "            mel_lens=spect_len, \n",
+   "            attn_prior=attn_prior, \n",
+   "        )[0]\n",
+   "        \n",
+   "    save_path = save_dir / f\"mel_{i}.npy\"\n",
+   "    np.save(save_path, spectrogram[0].to('cpu').numpy())\n",
+   "    r[\"mel_filepath\"] = str(save_path)\n",
+   "\n",
+   "hifigan_manifest_path = \"hifigan_train_ft.json\"\n",
+   "with open(hifigan_manifest_path, \"w\") as f:\n",
+   "    for r in records:\n",
+   "        f.write(json.dumps(r) + '\\n')\n",
   "# Please do the same for the validation json. Code is omitted.\n",
   "```\n",
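An optional check that the exported mels actually line up with the audio before finetuning the vocoder. A hedged sketch; the hop length of 256 is an assumption based on NeMo's 22050 Hz FastPitch configs:

```python
# Hedged sanity check: mel frame count should be about audio_samples / hop_length.
import json

import numpy as np
import soundfile as sf

with open("hifigan_train_ft.json") as f:
    r = json.loads(f.readline())

mel = np.load(r["mel_filepath"])
audio, sr = sf.read(r["audio_filepath"])
print(mel.shape[1], len(audio) // 256)  # frames vs. expected frames (~equal)
```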
   "\n",
   "We can then finetune HiFiGAN similarly to FastPitch using NeMo's [hifigan_finetune.py](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/hifigan_finetune.py) and [hifigan.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/hifigan/hifigan.yaml):\n",
   "\n",
-   "`python examples/tts/hifigan_finetune.py --config_name=hifigan.yaml model.train_ds.dataloader_params.batch_size=32 model.max_steps=1000 ~model.sched model.optim.lr=0.0001 train_dataset=./hifigan_train_ft.json validation_datasets=./hifigan_val_ft.json exp_manager.exp_dir=hifigan_ft +init_from_nemo_model=tts_hifigan.nemo trainer.check_val_every_n_epoch=10 model/train_ds=train_ds_finetune`\n",
+   "`python examples/tts/hifigan_finetune.py --config-name=hifigan.yaml model.train_ds.dataloader_params.batch_size=32 model.max_steps=1000 ~model.optim.sched model.optim.lr=0.0001 train_dataset=./hifigan_train_ft.json validation_datasets=./hifigan_val_ft.json exp_manager.exp_dir=hifigan_ft +init_from_nemo_model=tts_hifigan.nemo trainer.check_val_every_n_epoch=10 model/train_ds=train_ds_finetune model/validation_ds=val_ds_finetune`\n",
   "\n",
   "### Improving TTS by Adding More Data\n",
   "We can add more data in two ways. They can be combined for the best effect:\n",
diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb
index 4db6968ca7a3..45acbf931173 100644
--- a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb
+++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb
@@ -50,10 +50,10 @@
   "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n",
   "4. Run this cell to set up dependencies.\n",
   "\"\"\"\n",
+  "BRANCH = 'main'\n",
   "# # If you're using Colab and not running locally, uncomment and run this cell.\n",
   "# !apt-get install sox libsndfile1 ffmpeg\n",
   "# !pip install wget unidecode\n",
-  "# BRANCH = 'main'\n",
   "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]"
  ]
 },
@@ -91,7 +91,7 @@
   "\n",
   "FastPitch is a non-autoregressive model for mel-spectrogram generation based on FastSpeech, conditioned on fundamental frequency contours. For more details about the model, please refer to the original [paper](https://arxiv.org/abs/2006.06873). The NeMo re-implementation of FastPitch additionally uses the unsupervised speech-text [aligner](https://arxiv.org/abs/2108.10447) originally implemented in [FastPitch 1.1](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/FastPitch).\n",
   "\n",
-  "### MixerTTS\n",
+  "### Mixer-TTS\n",
   "\n",
   "Mixer-TTS is another non-autoregressive model for mel-spectrogram generation. It is structurally similar to FastPitch (duration prediction, pitch prediction, and an unsupervised TTS alignment framework), but the main difference is that Mixer-TTS is based on the [MLP-Mixer](https://arxiv.org/abs/2105.01601) architecture adapted for speech synthesis.\n",
   "\n",
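For readers unfamiliar with the referenced architecture, here is a hedged, self-contained sketch of the core MLP-Mixer block. It is illustrative only and not the NeMo implementation; note that the plain token-mixing MLP requires a fixed sequence length, which is why the Mixer-TTS paper replaces it with depth-wise 1D convolutions to handle variable-length speech:

```python
# Hedged sketch of an MLP-Mixer block: token-mixing and channel-mixing MLPs
# replace attention/convolutions. Illustrative; not the Mixer-TTS code.
import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    def __init__(self, num_tokens: int, num_channels: int, expansion: int = 4):
        super().__init__()
        self.norm1 = nn.LayerNorm(num_channels)
        self.token_mlp = nn.Sequential(  # mixes information across time steps
            nn.Linear(num_tokens, num_tokens * expansion), nn.GELU(),
            nn.Linear(num_tokens * expansion, num_tokens),
        )
        self.norm2 = nn.LayerNorm(num_channels)
        self.channel_mlp = nn.Sequential(  # mixes information across features
            nn.Linear(num_channels, num_channels * expansion), nn.GELU(),
            nn.Linear(num_channels * expansion, num_channels),
        )

    def forward(self, x):  # x: (batch, tokens, channels)
        # Token mixing: transpose so the MLP acts along the token/time axis.
        x = x + self.token_mlp(self.norm1(x).transpose(1, 2)).transpose(1, 2)
        # Channel mixing: MLP acts along the feature axis.
        x = x + self.channel_mlp(self.norm2(x))
        return x

block = MixerBlock(num_tokens=128, num_channels=384)
print(block(torch.randn(2, 128, 384)).shape)  # torch.Size([2, 128, 384])
```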
@@ -226,9 +226,9 @@
   "\n",
   "# additional files\n",
   "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n",
-  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
-  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/heteronyms-030921 \\\n",
-  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv \\\n",
+  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n",
+  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv \\\n",
   "&& cd .."
  ]
 },
@@ -251,10 +251,10 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/tts/fastpitch.py\n",
+  "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch.py\n",
   "\n",
   "!mkdir -p conf && cd conf \\\n",
-  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/tts/conf/fastpitch_align_v1.05.yaml \\\n",
+  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_v1.05.yaml \\\n",
   "&& cd .."
  ]
 },
@@ -392,10 +392,10 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/tts/mixer_tts.py\n",
+  "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/mixer_tts.py\n",
   "\n",
   "!mkdir -p conf && cd conf \\\n",
-  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/tts/conf/mixer-tts.yaml \\\n",
+  "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/mixer-tts.yaml \\\n",
   "&& cd .."
  ]
 },
@@ -533,7 +533,7 @@
  "id": "2d9745fc",
  "metadata": {},
  "source": [
-  "### MixerTTS\n",
+  "### Mixer-TTS\n",
   "\n",
   "Now we are ready to train our model! Let's try to train Mixer-TTS.\n",
@@ -601,7 +601,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.12"
+  "version": "3.8.6"
  }
 },
 "nbformat": 4,
diff --git a/tutorials/tts/Inference_DurationPitchControl.ipynb b/tutorials/tts/Inference_DurationPitchControl.ipynb
index 5dec260fc9f0..b632d4e91639 100644
--- a/tutorials/tts/Inference_DurationPitchControl.ipynb
+++ b/tutorials/tts/Inference_DurationPitchControl.ipynb
@@ -46,11 +46,11 @@
   "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n",
   "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", + "BRANCH = 'main'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode\n", - "# BRANCH = 'main'\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[tts]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, { @@ -504,7 +504,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.6" } }, "nbformat": 4, diff --git a/tutorials/tts/Inference_ModelSelect.ipynb b/tutorials/tts/Inference_ModelSelect.ipynb index 30daf755ba94..74d6bd1a6c4d 100644 --- a/tutorials/tts/Inference_ModelSelect.ipynb +++ b/tutorials/tts/Inference_ModelSelect.ipynb @@ -46,11 +46,11 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", + "BRANCH = 'main'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode\n", - "# BRANCH = 'main'\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[tts]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, { @@ -410,4 +410,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 1cd7a799330b..62e8af61d8db 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -58,7 +58,7 @@ "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[tts]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, { @@ -316,7 +316,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -330,7 +330,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.6" } }, "nbformat": 4, diff --git a/tutorials/tts/TalkNet_Training.ipynb b/tutorials/tts/TalkNet_Training.ipynb index f44f1591eae8..1cfd69b88ac5 100644 --- a/tutorials/tts/TalkNet_Training.ipynb +++ b/tutorials/tts/TalkNet_Training.ipynb @@ -50,10 +50,10 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n",
   "\"\"\"\n",
+  "BRANCH = 'main'\n",
   "# # If you're using Colab and not running locally, uncomment and run this cell.\n",
   "# !apt-get install sox libsndfile1 ffmpeg\n",
   "# !pip install wget unidecode pysptk\n",
-  "# BRANCH = 'main'\n",
   "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]"
  ]
 },
@@ -496,7 +496,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -510,7 +510,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.11"
+   "version": "3.8.6"
  }
 },
 "nbformat": 4,