From 30db4d4f3fff2cd8304e4b64e5141f87e5292123 Mon Sep 17 00:00:00 2001 From: treacker <36159472+treacker@users.noreply.github.com> Date: Wed, 11 May 2022 19:25:37 +0400 Subject: [PATCH] Tacotron2 retrain (#4103) * fix yaml Signed-off-by: treacker * Fix for new TTSDataset class Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * inference fix Signed-off-by: treacker * removed old code Signed-off-by: treacker * updated parser logic Signed-off-by: treacker * reverted version update Signed-off-by: treacker * refactored parser logic Signed-off-by: treacker * Updated Jenkinsfile Signed-off-by: treacker * Refactored tutorial for Tacotron2 Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Update Jenkinsfile Signed-off-by: treacker * Update tacotron.yaml Signed-off-by: treacker * Refactoring Signed-off-by: treacker * cleaned up TN/ ITN doc (#4119) * cleaned up TN/ ITN doc Signed-off-by: Yang Zhang * fix typo Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang Signed-off-by: treacker * Check implicit grad acc in GLUE dataset building (#4123) * Check implicit grad acc in GLUE dataset building Signed-off-by: MaximumEntropy * Fix jenkins test for GLUE/XNLI Signed-off-by: MaximumEntropy Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Fixed jenkins Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker Co-authored-by: Yang Zhang Co-authored-by: Sandeep Subramanian --- Jenkinsfile | 12 +- examples/tts/conf/tacotron2.yaml | 163 ++++-- nemo/collections/tts/helpers/helpers.py | 68 +++ nemo/collections/tts/models/tacotron2.py | 172 ++++-- tutorials/tts/Tacotron2_Training.ipynb | 680 +++++++++++------------ 5 files changed, 640 insertions(+), 455 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a8b6b8067ba3..63978e89c011 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2983,7 +2983,6 @@ pipeline { } } parallel { - // TODO(Oktai15): update it in 1.8.0 version stage('Tacotron 2') { steps { sh 'python examples/tts/tacotron2.py \ @@ -2993,13 +2992,18 @@ pipeline { trainer.accelerator="gpu" \ +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ trainer.strategy=null \ - model.train_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.batch_size=4 \ model.decoder.decoder_rnn_dim=256 \ model.decoder.attention_rnn_dim=1024 \ model.decoder.prenet_dim=128 \ model.postnet.postnet_n_convolutions=3 \ - ~trainer.check_val_every_n_epoch' + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=1 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=1 \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs \ + ~trainer.check_val_every_n_epoch \ + ' } } stage('WaveGlow') { diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index 58fbf4d750c2..a12b8d5489d6 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -1,81 +1,136 @@ -# TODO(Oktai15): update this config in 1.8.0 version +# This config contains the default values for training Tacotron2 model on LJSpeech dataset. 
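# A typical LJSpeech run with this config (manifest paths below are placeholders) looks like:
#   python examples/tts/tacotron2.py \
#     train_dataset=<ljspeech_train_manifest.json> \
#     validation_datasets=<ljspeech_val_manifest.json> \
#     trainer.devices=1 trainer.max_epochs=<num_epochs>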
+# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. name: Tacotron2 -sample_rate: 22050 -# , , will be added by the tacotron2.py script -labels: [' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z'] -n_fft: 1024 -n_mels: 80 -fmax: 8000 -n_stride: 256 -pad_value: -11.52 + train_dataset: ??? validation_datasets: ??? +sup_data_path: null +sup_data_types: null + +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" +whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv" + + model: - labels: ${labels} + pitch_fmin: 65.40639132514966 + pitch_fmax: 2093.004522404789 + + sample_rate: 22050 + n_mel_channels: 80 + n_window_size: 1024 + n_window_stride: 256 + n_fft: 1024 + lowfreq: 0 + highfreq: 8000 + window: hann + pad_value: -11.52 + + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: en + input_case: cased + whitelist: ${whitelist_path} + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer + punct: true + stresses: true + chars: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.torch.g2ps.EnglishG2p + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + train_ds: dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" + _target_: "nemo.collections.tts.torch.data.TTSDataset" manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} max_duration: null min_duration: 0.1 - trim: false - int_values: false - normalize: true - sample_rate: ${sample_rate} - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in Tacotron2 + ignore_file: null + trim: False + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} dataloader_params: drop_last: false shuffle: true batch_size: 48 num_workers: 4 - - + pin_memory: false + validation_ds: dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" - manifest_filepath: ${validation_datasets} + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} max_duration: null min_duration: 0.1 - int_values: false - normalize: true - sample_rate: ${sample_rate} - trim: false - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in 
Tacotron2 + ignore_file: null + trim: False + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} dataloader_params: drop_last: false shuffle: false - batch_size: 48 + batch_size: 24 num_workers: 8 + pin_memory: false preprocessor: _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - dither: 0.0 - nfilt: ${n_mels} - frame_splicing: 1 - highfreq: ${fmax} + nfilt: ${model.n_mel_channels} + highfreq: ${model.highfreq} log: true log_zero_guard_type: clamp log_zero_guard_value: 1e-05 - lowfreq: 0 - mag_power: 1.0 - n_fft: ${n_fft} - n_window_size: 1024 - n_window_stride: ${n_stride} - normalize: null + lowfreq: ${model.lowfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + n_window_stride: ${model.n_window_stride} pad_to: 16 - pad_value: ${pad_value} + pad_value: ${model.pad_value} + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null preemph: null - sample_rate: ${sample_rate} - window: hann + dither: 0.0 + frame_splicing: 1 + stft_conv: false + nb_augmentation_prob : 0 + mag_power: 1.0 + exact_pad: true + use_grads: false encoder: _target_: nemo.collections.tts.modules.tacotron2.Encoder @@ -90,7 +145,7 @@ model: gate_threshold: 0.5 max_decoder_steps: 1000 n_frames_per_step: 1 # currently only 1 is supported - n_mel_channels: ${n_mels} + n_mel_channels: ${model.n_mel_channels} p_attention_dropout: 0.1 p_decoder_dropout: 0.1 prenet_dim: 256 @@ -105,7 +160,7 @@ model: postnet: _target_: nemo.collections.tts.modules.tacotron2.Postnet - n_mel_channels: ${n_mels} + n_mel_channels: ${model.n_mel_channels} p_dropout: 0.5 postnet_embedding_dim: 512 postnet_kernel_size: 5 @@ -132,11 +187,15 @@ trainer: enable_checkpointing: False # Provided by exp_manager logger: False # Provided by exp_manager gradient_clip_val: 1.0 - log_every_n_steps: 200 - check_val_every_n_epoch: 25 + log_every_n_steps: 60 + check_val_every_n_epoch: 2 + exp_manager: exp_dir: null name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min diff --git a/nemo/collections/tts/helpers/helpers.py b/nemo/collections/tts/helpers/helpers.py index fea53bf1d8fa..995c6bd59fac 100644 --- a/nemo/collections/tts/helpers/helpers.py +++ b/nemo/collections/tts/helpers/helpers.py @@ -56,6 +56,12 @@ from nemo.utils import logging +HAVE_WANDB = True +try: + import wandb +except ModuleNotFoundError: + HAVE_WANDB = False + try: from pytorch_lightning.utilities import rank_zero_only except ModuleNotFoundError: @@ -284,6 +290,7 @@ def tacotron2_log_to_tb_func( step, dataformats="HWC", ) + if add_audio: filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmax=fmax) log_mel = mel_postnet[0].data.cpu().numpy().T @@ -299,6 +306,67 @@ def tacotron2_log_to_tb_func( swriter.add_audio(f"audio/{tag}_target", audio / max(np.abs(audio)), step, sample_rate=sr) +def tacotron2_log_to_wandb_func( + swriter, + tensors, + step, + tag="train", + log_images=False, + log_images_freq=1, + add_audio=True, + griffin_lim_mag_scale=1024, + griffin_lim_power=1.2, + sr=22050, + n_fft=1024, + n_mels=80, + fmax=8000, +): + _, spec_target, mel_postnet, gate, gate_target, alignments = tensors + if not HAVE_WANDB: + return + if log_images and step % log_images_freq == 0: + alignments = [] + specs = [] + gates = [] + alignments += [ + wandb.Image(plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), caption=f"{tag}_alignment",) 
+ ] + alignments += [ + wandb.Image(plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), caption=f"{tag}_mel_target",), + wandb.Image(plot_spectrogram_to_numpy(mel_postnet[0].data.cpu().numpy()), caption=f"{tag}_mel_predicted",), + ] + gates += [ + wandb.Image( + plot_gate_outputs_to_numpy( + gate_target[0].data.cpu().numpy(), torch.sigmoid(gate[0]).data.cpu().numpy(), + ), + caption=f"{tag}_gate", + ) + ] + + swriter.log({"specs": specs, "alignments": alignments, "gates": gates}) + + if add_audio: + audios = [] + filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmax=fmax) + log_mel = mel_postnet[0].data.cpu().numpy().T + mel = np.exp(log_mel) + magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale + audio_pred = griffin_lim(magnitude.T ** griffin_lim_power) + + log_mel = spec_target[0].data.cpu().numpy().T + mel = np.exp(log_mel) + magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale + audio_true = griffin_lim(magnitude.T ** griffin_lim_power) + + audios += [ + wandb.Audio(audio_true / max(np.abs(audio_true)), caption=f"{tag}_wav_target", sample_rate=sr,), + wandb.Audio(audio_pred / max(np.abs(audio_pred)), caption=f"{tag}_wav_predicted", sample_rate=sr,), + ] + + swriter.log({"audios": audios}) + + def plot_alignment_to_numpy(alignment, info=None): fig, ax = plt.subplots(figsize=(6, 4)) im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none') diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index baae5f2f43fb..60146741d900 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
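# Note on the W&B logging path added in helpers.py above: tacotron2_log_to_wandb_func
# silently returns when the `wandb` package is missing (HAVE_WANDB guard) and is only
# reached when the trainer has a WandbLogger attached. A minimal sketch of attaching one,
# assuming the standard NeMo exp_manager options, is via the experiment manager config:
#
#   exp_manager:
#     create_wandb_logger: true
#     wandb_logger_kwargs:
#       name: tacotron2_run   # hypothetical run name
#       project: tts          # hypothetical project name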
+import contextlib from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -19,11 +20,15 @@ from hydra.utils import instantiate from omegaconf import MISSING, DictConfig, OmegaConf, open_dict from omegaconf.errors import ConfigAttributeError -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger from torch import nn from nemo.collections.common.parts.preprocessing import parsers -from nemo.collections.tts.helpers.helpers import get_mask_from_lengths, tacotron2_log_to_tb_func +from nemo.collections.tts.helpers.helpers import ( + get_mask_from_lengths, + tacotron2_log_to_tb_func, + tacotron2_log_to_wandb_func, +) from nemo.collections.tts.losses.tacotron2loss import Tacotron2Loss from nemo.collections.tts.models.base import SpectrogramGenerator from nemo.core.classes.common import PretrainedModelInfo, typecheck @@ -36,7 +41,7 @@ SequenceToSequenceAlignmentType, ) from nemo.core.neural_types.neural_type import NeuralType -from nemo.utils import logging +from nemo.utils import logging, model_utils @dataclass @@ -60,8 +65,28 @@ class Tacotron2Model(SpectrogramGenerator): """Tacotron 2 Model that is used to generate mel spectrograms from text""" def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): - if isinstance(cfg, dict): - cfg = OmegaConf.create(cfg) + # Convert to Hydra 1.0 compatible DictConfig + cfg = model_utils.convert_model_config_to_dict_config(cfg) + cfg = model_utils.maybe_update_config_version(cfg) + + # setup normalizer + self.normalizer = None + self.text_normalizer_call = None + self.text_normalizer_call_kwargs = {} + self._setup_normalizer(cfg) + + # setup tokenizer + self.tokenizer = None + if hasattr(cfg, 'text_tokenizer'): + self._setup_tokenizer(cfg) + + self.num_tokens = len(self.tokenizer.tokens) + self.tokenizer_pad = self.tokenizer.pad + self.tokenizer_unk = self.tokenizer.oov + # assert self.tokenizer is not None + else: + self.num_tokens = len(cfg.labels) + 3 + super().__init__(cfg=cfg, trainer=trainer) schema = OmegaConf.structured(Tacotron2Config) @@ -73,17 +98,17 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Ensure passed cfg is compliant with schema try: OmegaConf.merge(cfg, schema) - self.pad_value = self._cfg.preprocessor.pad_value + self.pad_value = cfg.preprocessor.pad_value except ConfigAttributeError: - self.pad_value = self._cfg.preprocessor.params.pad_value + self.pad_value = cfg.preprocessor.params.pad_value logging.warning( "Your config is using an old NeMo yaml configuration. Please ensure that the yaml matches the " "current version in the main branch for future compatibility." 
) self._parser = None - self.audio_to_melspec_precessor = instantiate(self._cfg.preprocessor) - self.text_embedding = nn.Embedding(len(cfg.labels) + 3, 512) + self.audio_to_melspec_precessor = instantiate(cfg.preprocessor) + self.text_embedding = nn.Embedding(self.num_tokens, 512) self.encoder = instantiate(self._cfg.encoder) self.decoder = instantiate(self._cfg.decoder) self.postnet = instantiate(self._cfg.postnet) @@ -94,46 +119,45 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): def parser(self): if self._parser is not None: return self._parser - if self._validation_dl is not None: - return self._validation_dl.dataset.manifest_processor.parser - if self._test_dl is not None: - return self._test_dl.dataset.manifest_processor.parser - if self._train_dl is not None: - return self._train_dl.dataset.manifest_processor.parser - - # Else construct a parser - # Try to get params from validation, test, and then train - params = {} - try: - params = self._cfg.validation_ds.dataset - except ConfigAttributeError: - pass - if params == {}: - try: - params = self._cfg.test_ds.dataset - except ConfigAttributeError: - pass - if params == {}: - try: - params = self._cfg.train_ds.dataset - except ConfigAttributeError: - pass - - name = params.get('parser', None) or 'en' - unk_id = params.get('unk_index', None) or -1 - blank_id = params.get('blank_index', None) or -1 - do_normalize = params.get('normalize', True) - self._parser = parsers.make_parser( - labels=self._cfg.labels, name=name, unk_id=unk_id, blank_id=blank_id, do_normalize=do_normalize, - ) + + ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1] + if ds_class_name == "TTSDataset": + self._parser = None + elif hasattr(self._cfg, "labels"): + self._parser = parsers.make_parser( + labels=self._cfg.labels, + name='en', + unk_id=-1, + blank_id=-1, + do_normalize=True, + abbreviation_version="fastpitch", + make_table=False, + ) + elif ds_class_name == "AudioToCharWithPriorAndPitchDataset": + self.parser = self.vocab.encode + else: + raise ValueError("Wanted to setup parser, but model does not have necessary paramaters") + return self._parser - def parse(self, str_input: str) -> torch.tensor: - tokens = self.parser(str_input) - # Parser doesn't add bos and eos ids, so maunally add it - tokens = [len(self._cfg.labels)] + tokens + [len(self._cfg.labels) + 1] + def parse(self, text: str, normalize=True) -> torch.Tensor: + if self.training: + logging.warning("parse() is meant to be called in eval mode.") + if normalize and self.text_normalizer_call is not None: + text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs) + + eval_phon_mode = contextlib.nullcontext() + if hasattr(self.tokenizer, "set_phone_prob"): + eval_phon_mode = self.tokenizer.set_phone_prob(prob=1.0) + + with eval_phon_mode: + if self.tokenizer is not None: + tokens = self.tokenizer.encode(text) + else: + tokens = self.parser(text) + # Old parser doesn't add bos and eos ids, so maunally add it + tokens = [len(self._cfg.labels)] + tokens + [len(self._cfg.labels) + 1] tokens_tensor = torch.tensor(tokens).unsqueeze_(0).to(self.device) - return tokens_tensor @property @@ -259,18 +283,56 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: - tb_logger = self.logger.experiment + logger = self.logger.experiment if isinstance(self.logger, LoggerCollection): for logger in self.logger: if isinstance(logger, TensorBoardLogger): - tb_logger = 
logger.experiment + logger = logger.experiment break - tacotron2_log_to_tb_func( - tb_logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, - ) + if isinstance(logger, TensorBoardLogger): + tacotron2_log_to_tb_func( + logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, + ) + elif isinstance(logger, WandbLogger): + tacotron2_log_to_wandb_func( + logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, + ) avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() # This reduces across batches, not workers! self.log('val_loss', avg_loss) + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + def _setup_tokenizer(self, cfg): + text_tokenizer_kwargs = {} + if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: + g2p_kwargs = {} + + if "phoneme_dict" in cfg.text_tokenizer.g2p: + g2p_kwargs["phoneme_dict"] = self.register_artifact( + 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + ) + + if "heteronyms" in cfg.text_tokenizer.g2p: + g2p_kwargs["heteronyms"] = self.register_artifact( + 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + ) + + text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) + + self.tokenizer = instantiate(cfg.text_tokenizer, **text_tokenizer_kwargs) + def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"): if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig): raise ValueError(f"No dataset for {name}") @@ -289,11 +351,13 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na elif not shuffle_should_be and cfg.dataloader_params.shuffle: logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!") - labels = self._cfg.labels - dataset = instantiate( - cfg.dataset, labels=labels, bos_id=len(labels), eos_id=len(labels) + 1, pad_id=len(labels) + 2 + cfg.dataset, + text_normalizer=self.normalizer, + text_normalizer_call_kwargs=self.text_normalizer_call_kwargs, + text_tokenizer=self.tokenizer, ) + return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) def setup_training_data(self, cfg): diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 8109b734af60..be021cb78212 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -1,347 +1,337 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "htbJiaJjYQAD" - }, - "source": [ - "# Tacotron 2 Training\n", - "\n", - "This notebook is designed to provide a guide on how to train Tacotron2 as part of the TTS pipeline. It contains the following sections\n", - "\n", - " 1. Tacotron2 and NeMo - An introduction to the Tacotron2 model\n", - " 2. LJSpeech - How to train Tacotron2 on LJSpeech\n", - " 3. 
Custom Datasets - How to collect audio data to train Tacotron2 for difference voices and languages" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wqPMTEXXYUP4" - }, - "source": [ - "# License\n", - "\n", - "> Copyright 2020 NVIDIA. All Rights Reserved.\n", - "> \n", - "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "> you may not use this file except in compliance with the License.\n", - "> You may obtain a copy of the License at\n", - "> \n", - "> http://www.apache.org/licenses/LICENSE-2.0\n", - "> \n", - "> Unless required by applicable law or agreed to in writing, software\n", - "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "> See the License for the specific language governing permissions and\n", - "> limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SUkq9HAvYU7T" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies# .\n", - "\"\"\"\n", - "BRANCH = 'r1.9.0'\n", - "# # If you're using Colab and not running locally, uncomment and run this cell.\n", - "# !apt-get install sox libsndfile1 ffmpeg\n", - "# !pip install wget unidecode\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZivXzmq0YYLj" - }, - "source": [ - "# Tacotron2 and NeMo\n", - "\n", - "Tacotron2 is a neural network that converts text characters into a mel spectrogram. For more details on the model, please refer to Nvidia's [Tacotron2 Model Card](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2), or the original [paper](https://arxiv.org/abs/1712.05884).\n", - "\n", - "Tacotron2 like most NeMo models are defined as a LightningModule, allowing for easy training via PyTorch Lightning, and parameterized by a configuration, currently defined via a yaml file and loading using Hydra.\n", - "\n", - "Let's take a look using NeMo's pretrained model and how to use it to generate spectrograms." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HEvdSU5WYZbj" - }, - "outputs": [], - "source": [ - "# Load the Tacotron2Model\n", - "from nemo.collections.tts.models import Tacotron2Model\n", - "from nemo.collections.tts.models.base import SpectrogramGenerator\n", - "\n", - "# Let's see what pretrained models are available\n", - "print(Tacotron2Model.list_available_models())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3W8unatgYbUp" - }, - "outputs": [], - "source": [ - "# We can load the pre-trained model as follows\n", - "model = Tacotron2Model.from_pretrained(\"tts_en_tacotron2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xsyBa9tIdHp4" - }, - "outputs": [], - "source": [ - "# Tacotron2 is a SpectrogramGenerator\n", - "assert isinstance(model, SpectrogramGenerator)\n", - "\n", - "# SpectrogramGenerators in NeMo have two helper functions:\n", - "# 1. parse(str_input: str, **kwargs) which takes an English string and produces a token tensor\n", - "# 2. generate_spectrogram(tokens: 'torch.tensor', **kwargs) which takes the token tensor and generates a spectrogram\n", - "# Let's try it out\n", - "tokens = model.parse(str_input = \"Hey, this produces speech!\")\n", - "spectrogram = model.generate_spectrogram(tokens = tokens)\n", - "\n", - "# Now we can visualize the generated spectrogram\n", - "# If we want to generate speech, we have to use a vocoder in conjunction to a spectrogram generator.\n", - "# Refer to the TTS Inference notebook on how to convert spectrograms to speech.\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "%matplotlib inline\n", - "imshow(spectrogram.cpu().detach().numpy()[0,...], origin=\"lower\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zZ90eCfdrNIf" - }, - "source": [ - "# Training\n", - "\n", - "Now that we looked at the Tacotron2 model, let's see how to train a Tacotron2 Model\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7rHG-LERrPRY" - }, - "outputs": [], - "source": [ - "# NeMo's training scripts are stored inside the examples/ folder. Let's grab the tacotron2.py file\n", - "# as well as the tacotron2.yaml file\n", - "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", - "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Upv_LxBIsC51" - }, - "source": [ - "Let's take a look at the tacotron2.py file\n", - "\n", - "```python\n", - "import pytorch_lightning as pl\n", - "\n", - "from nemo.collections.common.callbacks import LogEpochTimeCallback\n", - "from nemo.collections.tts.models import Tacotron2Model\n", - "from nemo.core.config import hydra_runner\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "\n", - "# hydra_runner is a thin NeMo wrapper around Hydra\n", - "# It looks for a config named tacotron2.yaml inside the conf folder\n", - "# Hydra parses the yaml and returns it as a Omegaconf DictConfig\n", - "@hydra_runner(config_path=\"conf\", config_name=\"tacotron2\")\n", - "def main(cfg):\n", - " # Define the Lightning trainer\n", - " trainer = pl.Trainer(**cfg.trainer)\n", - " # exp_manager is a NeMo construct that helps with logging and checkpointing\n", - " exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - " # Define the Tacotron 2 model, this will construct the model as well as\n", - " # define the training and validation dataloaders\n", - " model = Tacotron2Model(cfg=cfg.model, trainer=trainer)\n", - " # Let's add a few more callbacks\n", - " lr_logger = pl.callbacks.LearningRateMonitor()\n", - " epoch_time_logger = LogEpochTimeCallback()\n", - " trainer.callbacks.extend([lr_logger, epoch_time_logger])\n", - " # Call lightning trainer's fit() to train the model\n", - " trainer.fit(model)\n", - "\n", - "\n", - "if __name__ == '__main__':\n", - " main() # noqa pylint: disable=no-value-for-parameter\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6nM-fZO-s75u" - }, - "source": [ - "Let's take a look at the yaml config\n", - "\n", - "```yaml\n", - "name: &name Tacotron2\n", - "sample_rate: &sr 22050\n", - "# , , will be added by the tacotron2.py script\n", - "labels: &labels [' ', '!', '\"', \"'\", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',\n", - " 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']',\n", - " 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',\n", - " 'u', 'v', 'w', 'x', 'y', 'z']\n", - "n_fft: &n_fft 1024\n", - "n_mels: &n_mels 80\n", - "fmax: &fmax null\n", - "n_stride: &n_window_stride 256\n", - "pad_value: &pad_value -11.52\n", - "train_dataset: ???\n", - "validation_datasets: ???\n", - "```\n", - "\n", - "The first part of the yaml defines some parameters used by Tacotron. You can see\n", - "that the sample rate is set to 22050 for LJSpeech. You can also see that this\n", - "model has characters for labels instead of phones. To use phones as input,\n", - "see the GlowTTS yaml and setup for an example.\n", - "\n", - "Looking at the yaml, there is `train_dataset: ???` and `validation_datasets: ???`. The ??? 
indicates to hydra that these values must be passed via the command line or the script will fail.\n", - "\n", - "Looking further down the yaml, we get to the pytorch lightning trainer parameters.\n", - "\n", - "```yaml\n", - "trainer:\n", - " devices: 1 # number of gpus\n", - " accelerator: 'gpu' \n", - " max_epochs: ???\n", - " num_nodes: 1\n", - " accelerator: 'gpu'\n", - " strategy: 'dp'\n", - " accumulate_grad_batches: 1\n", - " enable_checkpointing: False # Provided by exp_manager\n", - " logger: False # Provided by exp_manager\n", - " gradient_clip_val: 1.0\n", - " log_every_n_steps: 200\n", - " check_val_every_n_epoch: 25\n", - "```\n", - "\n", - "These values can be changed either by editing the yaml or through the command line.\n", - "\n", - "Let's grab some simple audio data and test Tacotron2." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GnEzODcorugt" - }, - "outputs": [], - "source": [ - "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz \\\n", - " && mkdir -p tests/data \\\n", - " && tar xzf test_data.tar.gz -C tests/data\n", - "\n", - "# Just like ASR, the Tacotron2 require .json files to define the training and validation data.\n", - "!cat tests/data/asr/an4_val.json\n", - "\n", - "# Now that we have some sample data, we can try training Tacotron 2\n", - "# NOTE: The sample data is not enough data to properly train a Tacotron 2. This will not result in a trained Tacotron 2 and is used to illustrate how to train Tacotron 2 model\n", - "!python tacotron2.py \\\n", - "sample_rate=16000 \\\n", - "train_dataset=tests/data/asr/an4_train.json \\\n", - "validation_datasets=tests/data/asr/an4_val.json \\\n", - "trainer.max_epochs=3 \\\n", - "trainer.accelerator=null trainer.check_val_every_n_epoch=1 \\\n", - "+trainer.gpus=1" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9erGDGZJ1H_p" - }, - "source": [ - "# Training Data\n", - "\n", - "In order to train Tacotron2, it is highly recommended to obtain high quality speech data with the following properties:\n", - " - Sampling rate of 22050Hz or higher\n", - " - Single speaker\n", - " - Speech should contain a variety of speech phonemes\n", - " - Audio split into segments of 1-10 seconds\n", - " - Audio segments should not have silence at the beginning and end\n", - " - Audio segments should not contain long silences inside\n", - "\n", - "After obtaining the speech data and splitting into training, validation, and test sections, it is required to construct .json files to tell NeMo where to find these audio files.\n", - "\n", - "The .json files should adhere to the format required by the `nemo.collections.asr.data.audio_to_text.AudioToCharDataset` class. 
For example, here is a sample .json file\n", - "\n", - "```json\n", - "{\"audio_filepath\": \"/path/to/audio1.wav\", \"text\": \"the transcription\", \"duration\": 0.82}\n", - "{\"audio_filepath\": \"/path/to/audio2.wav\", \"text\": \"the other transcription\", \"duration\": 2.1}\n", - "...\n", - "```\n", - "Please note that the duration is in seconds.\n", - "\n", - "Lastly, update the labels inside the Tacotron 2 yaml config if your data contains a different set of characters.\n", - "\n", - "Then you are ready to run your training script:\n", - "```bash\n", - "python tacotron2.py train_dataset=YOUR_TRAIN.json validation_datasets=YOUR_VAL.json trainer.devices=-1\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Taco2.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "htbJiaJjYQAD" + }, + "source": [ + "# Tacotron 2 Training\n", + "\n", + "This notebook is designed to provide a guide on how to train Tacotron2 as part of the TTS pipeline. It contains the following sections\n", + "\n", + " 1. Tacotron2 and NeMo - An introduction to the Tacotron2 model\n", + " 2. LJSpeech - How to train Tacotron2 on LJSpeech\n", + " 3. Custom Datasets - How to collect audio data to train Tacotron2 for difference voices and languages" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wqPMTEXXYUP4" + }, + "source": [ + "# License\n", + "\n", + "> Copyright 2020 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SUkq9HAvYU7T" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'main'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZivXzmq0YYLj" + }, + "source": [ + "# Tacotron2 and NeMo\n", + "\n", + "Tacotron2 is a neural network that converts text characters into a mel spectrogram. For more details on the model, please refer to Nvidia's [Tacotron2 Model Card](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2), or the original [paper](https://arxiv.org/abs/1712.05884).\n", + "\n", + "Tacotron2 like most NeMo models are defined as a LightningModule, allowing for easy training via PyTorch Lightning, and parameterized by a configuration, currently defined via a yaml file and loading using Hydra.\n", + "\n", + "Let's take a look using NeMo's pretrained model and how to use it to generate spectrograms." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HEvdSU5WYZbj" + }, + "outputs": [], + "source": [ + "# Load the Tacotron2Model\n", + "from nemo.collections.tts.models import Tacotron2Model\n", + "from nemo.collections.tts.models.base import SpectrogramGenerator\n", + "\n", + "# Let's see what pretrained models are available\n", + "print(Tacotron2Model.list_available_models())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3W8unatgYbUp" + }, + "outputs": [], + "source": [ + "# We can load the pre-trained model as follows\n", + "model = Tacotron2Model.from_pretrained(\"tts_en_tacotron2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xsyBa9tIdHp4" + }, + "outputs": [], + "source": [ + "# Tacotron2 is a SpectrogramGenerator\n", + "assert isinstance(model, SpectrogramGenerator)\n", + "\n", + "# SpectrogramGenerators in NeMo have two helper functions:\n", + "# 1. parse(self, text: str, normalize=True) which takes an English string and produces a token tensor\n", + "# 2. generate_spectrogram(self, *, tokens) which takes the token tensor and generates a spectrogram\n", + "# Let's try it out\n", + "tokens = model.parse(text = \"Hey, this produces speech!\")\n", + "spectrogram = model.generate_spectrogram(tokens = tokens)\n", + "\n", + "# Now we can visualize the generated spectrogram\n", + "# If we want to generate speech, we have to use a vocoder in conjunction to a spectrogram generator.\n", + "# Refer to the TTS Inference notebook on how to convert spectrograms to speech.\n", + "from matplotlib.pyplot import imshow\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "imshow(spectrogram.cpu().detach().numpy()[0,...], origin=\"lower\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zZ90eCfdrNIf" + }, + "source": [ + "# Training\n", + "\n", + "Now that we looked at the Tacotron2 model, let's see how to train a Tacotron2 Model\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7rHG-LERrPRY" + }, + "outputs": [], + "source": [ + "# NeMo's training scripts are stored inside the examples/ folder. 
Let's grab the tacotron2.py file\n", + "# as well as the tacotron2.yaml file\n", + "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", + "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Upv_LxBIsC51" + }, + "source": [ + "Let's take a look at the tacotron2.py file\n", + "\n", + "```python\n", + "import pytorch_lightning as pl\n", + "\n", + "from nemo.collections.common.callbacks import LogEpochTimeCallback\n", + "from nemo.collections.tts.models import Tacotron2Model\n", + "from nemo.core.config import hydra_runner\n", + "from nemo.utils.exp_manager import exp_manager\n", + "\n", + "\n", + "# hydra_runner is a thin NeMo wrapper around Hydra\n", + "# It looks for a config named tacotron2.yaml inside the conf folder\n", + "# Hydra parses the yaml and returns it as a Omegaconf DictConfig\n", + "@hydra_runner(config_path=\"conf\", config_name=\"tacotron2\")\n", + "def main(cfg):\n", + " # Define the Lightning trainer\n", + " trainer = pl.Trainer(**cfg.trainer)\n", + " # exp_manager is a NeMo construct that helps with logging and checkpointing\n", + " exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", + " # Define the Tacotron 2 model, this will construct the model as well as\n", + " # define the training and validation dataloaders\n", + " model = Tacotron2Model(cfg=cfg.model, trainer=trainer)\n", + " # Let's add a few more callbacks\n", + " lr_logger = pl.callbacks.LearningRateMonitor()\n", + " epoch_time_logger = LogEpochTimeCallback()\n", + " trainer.callbacks.extend([lr_logger, epoch_time_logger])\n", + " # Call lightning trainer's fit() to train the model\n", + " trainer.fit(model)\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " main() # noqa pylint: disable=no-value-for-parameter\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6nM-fZO-s75u" + }, + "source": [ + "Let's take a look at the yaml config\n", + "\n", + "```yaml\n", + "name: &name Tacotron2\n", + "\n", + "train_dataset: ???\n", + "validation_datasets: ???\n", + "sup_data_path: null\n", + "sup_data_types: null\n", + "\n", + "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.01\"\n", + "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-030921\"\n", + "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv\"\n", + "```\n", + "\n", + "The first part of the yaml defines dataset parameters used by Tacotron. Then in the head of 'model' section there are processing - related parameters. You can see\n", + "that the sample rate is set to 22050 for LJSpeech. \n", + "\n", + "Looking at the yaml, there is `train_dataset: ???` and `validation_datasets: ???`. The ??? 
indicates to hydra that these values must be passed via the command line or the script will fail.\n", + "\n", + "Looking further down the yaml, we get to the pytorch lightning trainer parameters.\n", + "\n", + "```yaml\n", + "trainer:\n", + " devices: 1 # number of gpus\n", + " accelerator: 'gpu' \n", + " max_epochs: ???\n", + " num_nodes: 1\n", + " accelerator: 'gpu'\n", + " strategy: 'ddp'\n", + " accumulate_grad_batches: 1\n", + " enable_checkpointing: False # Provided by exp_manager\n", + " logger: False # Provided by exp_manager\n", + " gradient_clip_val: 1.0\n", + " log_every_n_steps: 200\n", + " check_val_every_n_epoch: 25\n", + "```\n", + "\n", + "These values can be changed either by editing the yaml or through the command line.\n", + "\n", + "Let's grab some simple audio data and test Tacotron2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GnEzODcorugt" + }, + "outputs": [], + "source": [ + "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz && mkdir -p tests/data && tar xzf test_data.tar.gz -C tests/data\n", + "\n", + "# Just like ASR, the Tacotron2 require .json files to define the training and validation data.\n", + "!cat tests/data/asr/an4_val.json\n", + "\n", + "# Now that we have some sample data, we can try training Tacotron 2\n", + "# NOTE: The sample data is not enough data to properly train a Tacotron 2. This will not result in a trained Tacotron 2 and is used to illustrate how to train Tacotron 2 model\n", + "!python tacotron2.py sample_rate=16000 train_dataset=tests/data/asr/an4_train.json validation_datasets=tests/data/asr/an4_val.json trainer.max_epochs=3 trainer.accelerator=null trainer.check_val_every_n_epoch=1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9erGDGZJ1H_p" + }, + "source": [ + "# Training Data\n", + "\n", + "In order to train Tacotron2, it is highly recommended to obtain high quality speech data with the following properties:\n", + " - Sampling rate of 22050Hz or higher\n", + " - Single speaker\n", + " - Speech should contain a variety of speech phonemes\n", + " - Audio split into segments of 1-10 seconds\n", + " - Audio segments should not have silence at the beginning and end\n", + " - Audio segments should not contain long silences inside\n", + "\n", + "After obtaining the speech data and splitting into training, validation, and test sections, it is required to construct .json files to tell NeMo where to find these audio files.\n", + "\n", + "The .json files should adhere to the format required by the `nemo.collections.tts.torch.data.TTSDataset` class. 
For example, here is a sample .json file\n", + "\n", + "```json\n", + "{\"audio_filepath\": \"/path/to/audio1.wav\", \"text\": \"the transcription\", \"duration\": 0.82}\n", + "{\"audio_filepath\": \"/path/to/audio2.wav\", \"text\": \"the other transcription\", \"duration\": 2.1}\n", + "...\n", + "```\n", + "Please note that the duration is in seconds.\n", + "\n", + "\n", + "Then you are ready to run your training script:\n", + "```bash\n", + "python tacotron2.py train_dataset=YOUR_TRAIN.json validation_datasets=YOUR_VAL.json trainer.devices=-1\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Taco2.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } \ No newline at end of file
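For reference, the per-line JSON manifest described in the tutorial's Training Data section can be produced with a short script. The sketch below is not part of the patch; the directory layout and the one-transcript-file-per-wav convention are assumptions, so adapt the paths to your own data.

import glob
import json
import os

import soundfile as sf  # used only to read the audio length

manifest_lines = []
for wav_path in sorted(glob.glob("my_dataset/wavs/*.wav")):  # hypothetical layout
    # Assume the transcript for audio1.wav lives next to it as audio1.txt.
    txt_path = os.path.splitext(wav_path)[0] + ".txt"
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read().strip()
    audio, sr = sf.read(wav_path)
    duration = len(audio) / sr  # NeMo expects the duration in seconds
    manifest_lines.append(
        json.dumps({"audio_filepath": os.path.abspath(wav_path), "text": text, "duration": round(duration, 2)})
    )

with open("train_manifest.json", "w", encoding="utf-8") as f:
    f.write("\n".join(manifest_lines) + "\n")

Each output line is a standalone JSON object, matching the manifest format consumed by nemo.collections.tts.torch.data.TTSDataset.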