From 1826b7355106b642bbfe2ee814d1a095d2565f4a Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Mon, 7 Nov 2022 10:26:25 -0800 Subject: [PATCH] [TTS] update organization of model checkpoints and their pointers. (#5327) * [TTS] update organization of model checkpoints and their pointers. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * move model name column to the 2nd col and correct model names as predefined_model_name. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- docs/source/conf.py | 2 + docs/source/tts/checkpoints.rst | 47 ++++++++++++++++++++- docs/source/tts/data/ngc_models_aligner.csv | 5 ++- docs/source/tts/data/ngc_models_am.csv | 19 +++++---- docs/source/tts/data/ngc_models_vocoder.csv | 21 ++++----- 5 files changed, 71 insertions(+), 23 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 236c4f8a8af6..0ec488174248 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -110,6 +110,8 @@ "sphinx.ext.autosectionlabel", "sphinxcontrib.bibtex", "sphinx_copybutton", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", ] bibtex_bibfiles = [ diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst index 8afe7c9c783c..cbed0726d61f 100644 --- a/docs/source/tts/checkpoints.rst +++ b/docs/source/tts/checkpoints.rst @@ -34,7 +34,46 @@ The models can be accessed via the :code:`from_pretrained()` method inside the T import nemo.collections.tts as nemo_tts model = nemo_tts.models..from_pretrained(model_name="") -where ```` is generally the basename of the "Model Card" entry in the tables in :ref:`Checkpoints`. For example, the basename of the English FastPitch mel-generator model from https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastpitch is :code:`"tts_en_fastpitch"`.
You could load this model by running, +where ```` is the value in the ``Model Name`` column of the tables in :ref:`Checkpoints`. These names are predefined in each model's member function ``self.list_available_models()``. For example, the available NGC FastPitch model names can be listed as follows: + +.. ipython:: + + In [1]: import nemo.collections.tts as nemo_tts + + In [2]: nemo_tts.models.FastPitchModel.list_available_models() + Out[2]: + [PretrainedModelInfo( + pretrained_model_name=tts_en_fastpitch, + description=This model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent. It is ARPABET-based., + location=https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo, + class_= + ), + PretrainedModelInfo( + pretrained_model_name=tts_en_fastpitch_ipa, + description=This model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent.
It is IPA-based., + location=https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/IPA_1.13.0/files/tts_en_fastpitch_align_ipa.nemo, + class_= + ), + PretrainedModelInfo( + pretrained_model_name=tts_en_fastpitch_multispeaker, + description=This model is trained on HiFITTS sampled at 44100Hz with and can be used to generate male and female English voices with an American accent., + location=https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_fastpitch_multispeaker.nemo, + class_= + ), + PretrainedModelInfo( + pretrained_model_name=tts_de_fastpitch_singlespeaker, + description=This model is trained on a single male speaker data in OpenSLR Neutral German Dataset sampled at 22050Hz and can be used to generate male German voices., + location=https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo, + class_= + ), + PretrainedModelInfo( + pretrained_model_name=tts_de_fastpitch_multispeaker_5, + description=This model is trained on 5 speakers in HUI-Audio-Corpus-German clean subset sampled at 44100Hz with and can be used to generate male and female German voices., + location=https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo, + class_= + )] + +From the above key-value pair ``pretrained_model_name=tts_en_fastpitch``, you could get the model name ``tts_en_fastpitch`` and load it by running, .. code-block:: python @@ -79,7 +118,11 @@ There are multiple TTS tutorials provided in the directory of `tutorials/tts/ `_. +This section summarizes a full list of available NeMo TTS models that have been released in `NGC NeMo Text to Speech Collection `_. 
You can download the model checkpoints you are interested in using either of the commands below: + +* :code:`wget ''` +* :code:`curl -LO ''` + Speech/Text Aligners ^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/tts/data/ngc_models_aligner.csv b/docs/source/tts/data/ngc_models_aligner.csv index 4831322a8520..6860af0e55cf 100644 --- a/docs/source/tts/data/ngc_models_aligner.csv +++ b/docs/source/tts/data/ngc_models_aligner.csv @@ -1,2 +1,3 @@ -Locale,Dataset,Sampling Rate,#Speakers,Model Name,Model Class,Model Card -en-US,LJSpeech,22050Hz,Single,RAD-TTS Aligner,nemo.collections.tts.models.aligner.AlignerModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner \ No newline at end of file +Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint +en-US,tts_en_radtts_aligner,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/ARPABET_1.11.0/files/Aligner.nemo`` +en-US,tts_en_radtts_aligner_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo`` \ No newline at end of file diff --git a/docs/source/tts/data/ngc_models_am.csv b/docs/source/tts/data/ngc_models_am.csv index fbc7dd8ea17f..ee42f9806e3d 100644 --- a/docs/source/tts/data/ngc_models_am.csv +++ b/docs/source/tts/data/ngc_models_am.csv @@ -1,9 +1,10 @@ -Locale,Dataset,Sampling Rate,#Speakers,Model Name,Model Base Class,Model Card -en-US,LJSpeech,22050Hz,1,Tacotron2,nemo.collections.tts.models.tacotron2.Tacotron2Model,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2 -en-US,LJSpeech,22050Hz,1,FastPitch,nemo.collections.tts.models.fastpitch.FastPitchModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastpitch
-en-US,LJSpeech,22050Hz,1,Mixer-TTS,nemo.collections.tts.models.mixer_tts.MixerTTSModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_mixertts -en-US,LJSpeech,22050Hz,1,Mixer-TTS-X,nemo.collections.tts.models.mixer_tts.MixerTTSModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_mixerttsx -en-US,HiFiTTS,44100Hz,10,FastPitch-HiFiTTS,nemo.collections.tts.models.fastpitch.FastPitchModel,https://ngc.nvidia.com/models/nvidia:nemo:tts_en_multispeaker_fastpitchhifigan -en-US,TBD,TBD,TBD,RAD-TTS,nemo.collections.tts.models.radtts.RadTTSModel,TBD -de-DE,HUI Audio Corpus German,44100Hz,5,FastPitch,nemo.collections.tts.models.fastpitch.FastPitchModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5 -de-DE,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,FastPitch,nemo.collections.tts.models.fastpitch.FastPitchModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan \ No newline at end of file +Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint +en-US,tts_en_fastpitch,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo`` +en-US,tts_en_fastpitch_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/IPA_1.13.0/files/tts_en_fastpitch_align_ipa.nemo`` +en-US,tts_en_fastpitch_multispeaker,HiFiTTS,44100Hz,10,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_multispeaker_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_fastpitch_multispeaker.nemo`` 
+en-US,tts_en_lj_mixertts,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.mixer_tts.MixerTTSModel,`tts_en_lj_mixertts `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixertts/versions/1.6.0/files/tts_en_lj_mixertts.nemo`` +en-US,tts_en_lj_mixerttsx,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.mixer_tts.MixerTTSModel,`tts_en_lj_mixerttsx `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixerttsx/versions/1.6.0/files/tts_en_lj_mixerttsx.nemo`` +en-US,RAD-TTS,TBD,TBD,TBD,ARPABET,nemo.collections.tts.models.radtts.RadTTSModel,TBD, +en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.tacotron2.Tacotron2Model,`tts_en_tacotron2 `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.0.0/files/tts_en_tacotron2.nemo`` +de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo`` +de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo`` \ No newline at end of file diff --git a/docs/source/tts/data/ngc_models_vocoder.csv b/docs/source/tts/data/ngc_models_vocoder.csv index f0b11f58d56c..e38e863b2c0e 100644 --- a/docs/source/tts/data/ngc_models_vocoder.csv +++ b/docs/source/tts/data/ngc_models_vocoder.csv @@ -1,10 +1,11 @@ -Locale,Dataset,Sampling Rate,#Speakers,Model Name,Model Base Class,Model Card -en-US,LJSpeech,22050Hz,Single,WaveGlow-88m,nemo.collections.tts.models.waveglow.WaveGlowModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_88m 
-en-US,LJSpeech,22050Hz,Single,WaveGlow-268m,nemo.collections.tts.models.waveglow.WaveGlowModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_268m -en-US,LJSpeech,22050Hz,Single,HiFiGAN-LJSpeech-1.0.0rc1,nemo.collections.tts.models.hifigan.HifiGanModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_hifigan -en-US,LJSpeech,22050Hz,Single,HiFiGAN-LJSpeech-1.6.0,nemo.collections.tts.models.hifigan.HifiGanModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_hifigan -en-US,HiFiTTS,44100Hz,Multiple,HiFiGAN-HiFiTTS,nemo.collections.tts.models.hifigan.HifiGanModel,https://ngc.nvidia.com/models/nvidia:nemo:tts_en_multispeaker_fastpitchhifigan -en-US,LJSpeech,22050Hz,Single,UnivNet-LJSpeech,nemo.collections.tts.models.univnet.UnivNetModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_univnet -en-US,LibriTTS,24000Hz,Single,UnivNet-LibriTTS,nemo.collections.tts.models.univnet.UnivNetModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_libritts_univnet -de-DE,HUI Audio Corpus German,44100Hz,Multiple,HiFiGAN-HUI,nemo.collections.tts.models.hifigan.HifiGanModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5 -de-DE,Thorsten Müller (German Neutral-TTS dataset),22050Hz,Single,HiFiGAN-Thorsten,nemo.collections.tts.models.hifigan.HifiGanModel,https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan \ No newline at end of file +Locale,Model Name,Spectrogram Generator,Dataset,Sampling Rate,#Spk,Model Class,Overview,Checkpoint +en-US,tts_hifigan,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_hifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/files/tts_hifigan.nemo`` +en-US,tts_en_lj_hifigan_ft_mixertts,Mixer-TTS,LJSpeech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_en_lj_hifigan 
`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_hifigan/versions/1.6.0/files/tts_en_lj_hifigan_ft_mixertts.nemo`` +en-US,tts_en_lj_hifigan_ft_mixerttsx,Mixer-TTS-X,LJSpeech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_en_lj_hifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_hifigan/versions/1.6.0/files/tts_en_lj_hifigan_ft_mixerttsx.nemo`` +en-US,tts_en_hifitts_hifigan_ft_fastpitch,FastPitch,HiFiTTS,44100Hz,10,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_en_multispeaker_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_hifitts_hifigan_ft_fastpitch.nemo`` +en-US,tts_en_lj_univnet,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.univnet.UnivNetModel,`tts_en_lj_univnet `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_univnet/versions/1.7.0/files/tts_en_lj_univnet.nemo`` +en-US,tts_en_libritts_univnet,librosa.filters.mel,LibriTTS,24000Hz,1,nemo.collections.tts.models.univnet.UnivNetModel,`tts_en_libritts_univnet `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_libritts_univnet/versions/1.7.0/files/tts_en_libritts_multispeaker_univnet.nemo`` +en-US,tts_waveglow_88m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_88m `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_88m/versions/1.0.0/files/tts_waveglow.nemo`` +en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_268m `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_268m/versions/1.0.0rc1/files/tts_waveglow_268m.nemo`` +de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 
`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo`` +de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo`` \ No newline at end of file