Skip to content

Commit

Permalink
[TTS][ZH] add fastpitch and hifigan model NGC urls and update NeMo do…
Browse files Browse the repository at this point in the history
…cs. (#5596)

* [TTS][ZH] add fastpitch and hifigan model NGC urls and update NeMo docs.

Signed-off-by: Xuesong Yang <[email protected]>
  • Loading branch information
XuesongYang authored Dec 14, 2022
1 parent 9982920 commit 7104f93
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/source/tts/data/datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ Spanish,es-CL,Crowdsourced high-quality Chilean Spanish,31,13,18,7.15,2.84,4.31,
Spanish,es-CO,Crowdsourced high-quality Colombian Spanish,33,16,17,7.58,3.74,3.84,"48,000Hz",https://www.openslr.org/72/
Spanish,es-PE,Crowdsourced high-quality Peruvian Spanish,38,18,20,9.22,4.35,4.87,"48,000Hz",https://www.openslr.org/73/
Spanish,es-PR,Crowdsourced high-quality Puerto Rico Spanish,5,5,0,1.00,1.00,0.00,"48,000Hz",https://www.openslr.org/74/
Spanish,es-VE,Crowdsourced high-quality Venezuelan Spanish,23,11,12,4.81,2.41,2.40,"48,000Hz",https://www.openslr.org/75/
Spanish,es-VE,Crowdsourced high-quality Venezuelan Spanish,23,11,12,4.81,2.41,2.40,"48,000Hz",https://www.openslr.org/75/
Chinese,zh-CN,SFSpeech Chinese/English Bilingual Speech,1,1,0,4.50,4.50,0.00,"22,050Hz",https://catalog.ngc.nvidia.com/orgs/nvidia/resources/sf_bilingual_speech_zh_en
2 changes: 1 addition & 1 deletion docs/source/tts/data/ngc_models_aligner.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint
en-US,tts_en_radtts_aligner,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/ARPABET_1.11.0/files/Aligner.nemo``
en-US,tts_en_radtts_aligner_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo``
en-US,tts_en_radtts_aligner_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo``
3 changes: 2 additions & 1 deletion docs/source/tts/data/ngc_models_am.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ en-US,RAD-TTS,TBD,TBD,TBD,ARPABET,nemo.collections.tts.models.radtts.RadTTSModel
en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.tacotron2.Tacotron2Model,`tts_en_tacotron2 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.0.0/files/tts_en_tacotron2.nemo``
de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo``
de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo``
es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,grapheme,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_fastpitch_multispeaker.nemo``
es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,grapheme,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_fastpitch_multispeaker.nemo``
zh-CN,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo``
3 changes: 2 additions & 1 deletion docs/source/tts/data/ngc_models_vocoder.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ en-US,tts_waveglow_88m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.t
en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_268m <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_268m>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_268m/versions/1.0.0rc1/files/tts_waveglow_268m.nemo``
de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo``
de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo``
es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
zh-CN,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo``
33 changes: 28 additions & 5 deletions docs/source/tts/datasets.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Data Preprocessing
==================

NeMo TTS recipes support most of public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes covered English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (work in progress), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/torch/data.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested.
NeMo TTS recipes support most public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes cover English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (zh-CN), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/torch/data.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested.

There should be one ``JSON`` manifest file per dataset that will be passed in, therefore, if the user wants separate training and validation datasets, they should also have separate manifests. Otherwise, they will be loading validation data with their training data and vice versa. Each line of the manifest should be in the following format:

Expand Down Expand Up @@ -66,7 +66,9 @@ LibriTTS
$ python scripts/dataset_processing/tts/libritts/get_data.py \
--data-root <your_local_dataset_root> \
--data-sets dev_clean
--manifests-path <your_manifest_store_path> \
--val-size 0.01 \
--test-size 0.01
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path ljspeech/ds_conf \
Expand All @@ -88,19 +90,19 @@ The texts of this dataset have been normalized already. So there is no extra need
Thorsten Müller (German Neutral-TTS dataset)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Dataset URL: https://www.openslr.org/resources/95/
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/openslr/get_data.py
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/openslr_95/get_data.py
* Command Line Instruction:

.. code-block:: bash
$ python scripts/dataset_processing/tts/openslr/get_data.py \
$ python scripts/dataset_processing/tts/openslr_95/get_data.py \
--data-root <your_local_dataset_root> \
--val-size 0.1 \
--test-size 0.2 \
--seed-for-ds-split 100
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path openslr/ds_conf \
--config-path openslr_95/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
Expand Down Expand Up @@ -130,4 +132,25 @@ HUI Audio Corpus German
--config-path hui_acg/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
SFSpeech Chinese/English Bilingual Speech
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Dataset URL: https://catalog.ngc.nvidia.com/orgs/nvidia/resources/sf_bilingual_speech_zh_en
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/sfbilingual/get_data.py
* Command Line Instruction:

.. code-block:: bash
$ python scripts/dataset_processing/tts/sfbilingual/get_data.py \
--data-root <your_local_dataset_root> \
--val-size 0.1 \
--test-size 0.2 \
--seed-for-ds-split 100
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path sfbilingual/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
9 changes: 9 additions & 0 deletions nemo/collections/tts/models/fastpitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]':
)
list_of_models.append(model)

# zh, single speaker, 22050Hz, SFSpeech Bilingual Chinese/English dataset
model = PretrainedModelInfo(
pretrained_model_name="tts_zh_fastpitch_sfspeech",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo",
description="This model is trained on a single female speaker in SFSpeech Bilingual Chinese/English dataset sampled at 22050Hz and can be used to generate female Mandarin Chinese voices.",
class_=cls,
)
list_of_models.append(model)

return list_of_models

# Methods for model exportability
Expand Down
11 changes: 11 additions & 0 deletions nemo/collections/tts/models/hifigan.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]':
)
list_of_models.append(model)

# Spanish, multi-speaker, 44100 Hz, Latin American Spanish OpenSLR
model = PretrainedModelInfo(
pretrained_model_name="tts_es_hifigan_ft_fastpitch_multispeaker",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo",
Expand All @@ -413,6 +414,16 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]':
)
list_of_models.append(model)

# zh, single female speaker, 22050 Hz, SFSpeech Chinese/English Bilingual Dataset.
model = PretrainedModelInfo(
pretrained_model_name="tts_zh_hifigan_sfspeech",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo",
description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_hifitts_hifigan_ft_fastpitch` "
"by the mel-spectrograms generated from the FastPitch checkpoint `tts_zh_fastpitch_sfspeech`. This model "
"has been tested on generating female Mandarin Chinese voices.",
class_=cls,
)
list_of_models.append(model)
return list_of_models

def load_state_dict(self, state_dict, strict=True):
Expand Down

0 comments on commit 7104f93

Please sign in to comment.