Skip to content

Commit

Permalink
[TTS][ZH] add fastpitch and hifigan model NGC urls and update NeMo do…
Browse files Browse the repository at this point in the history
…cs. (#5596)

* [TTS][ZH] add fastpitch and hifigan model NGC urls and update NeMo docs.

Signed-off-by: Xuesong Yang <[email protected]>
  • Loading branch information
XuesongYang authored Dec 14, 2022
1 parent 9982920 commit 7104f93
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/source/tts/data/datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ Spanish,es-CL,Crowdsourced high-quality Chilean Spanish,31,13,18,7.15,2.84,4.31,
Spanish,es-CO,Crowdsourced high-quality Colombian Spanish,33,16,17,7.58,3.74,3.84,"48,000Hz",https://www.openslr.org/72/
Spanish,es-PE,Crowdsourced high-quality Peruvian Spanish,38,18,20,9.22,4.35,4.87,"48,000Hz",https://www.openslr.org/73/
Spanish,es-PR,Crowdsourced high-quality Puerto Rico Spanish,5,5,0,1.00,1.00,0.00,"48,000Hz",https://www.openslr.org/74/
Spanish,es-VE,Crowdsourced high-quality Venezuelan Spanish,23,11,12,4.81,2.41,2.40,"48,000Hz",https://www.openslr.org/75/
Spanish,es-VE,Crowdsourced high-quality Venezuelan Spanish,23,11,12,4.81,2.41,2.40,"48,000Hz",https://www.openslr.org/75/
Chinese,zh-CN,SFSpeech Chinese/English Bilingual Speech,1,1,0,4.50,4.50,0.00,"22,050Hz",https://catalog.ngc.nvidia.com/orgs/nvidia/resources/sf_bilingual_speech_zh_en
2 changes: 1 addition & 1 deletion docs/source/tts/data/ngc_models_aligner.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint
en-US,tts_en_radtts_aligner,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/ARPABET_1.11.0/files/Aligner.nemo``
en-US,tts_en_radtts_aligner_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo``
en-US,tts_en_radtts_aligner_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.aligner.AlignerModel,`tts_en_radtts_aligner <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_radtts_aligner>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo``
3 changes: 2 additions & 1 deletion docs/source/tts/data/ngc_models_am.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ en-US,RAD-TTS,TBD,TBD,TBD,ARPABET,nemo.collections.tts.models.radtts.RadTTSModel
en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.tacotron2.Tacotron2Model,`tts_en_tacotron2 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.0.0/files/tts_en_tacotron2.nemo``
de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo``
de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo``
es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,grapheme,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_fastpitch_multispeaker.nemo``
es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,grapheme,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_fastpitch_multispeaker.nemo``
zh-CN,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo``
3 changes: 2 additions & 1 deletion docs/source/tts/data/ngc_models_vocoder.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ en-US,tts_waveglow_88m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.t
en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_268m <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_268m>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_268m/versions/1.0.0rc1/files/tts_waveglow_268m.nemo``
de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo``
de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo``
es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
zh-CN,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo``
33 changes: 28 additions & 5 deletions docs/source/tts/datasets.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Data Preprocessing
==================

NeMo TTS recipes support most of public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes covered English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (work in progress), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/torch/data.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested.
NeMo TTS recipes support most public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes cover English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (zh-CN), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/torch/data.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested.

There should be one ``JSON`` manifest file per dataset that will be passed in, therefore, if the user wants separate training and validation datasets, they should also have separate manifests. Otherwise, they will be loading validation data with their training data and vice versa. Each line of the manifest should be in the following format:

Expand Down Expand Up @@ -66,7 +66,9 @@ LibriTTS
$ python scripts/dataset_processing/tts/libritts/get_data.py \
--data-root <your_local_dataset_root> \
--data-sets dev_clean
--manifests-path <your_manifest_store_path> \
--val-size 0.01 \
--test-size 0.01
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path ljspeech/ds_conf \
Expand All @@ -88,19 +90,19 @@ The texts of this dataset have been normalized already. So there is no extra need
Thorsten Müller (German Neutral-TTS dataset)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Dataset URL: https://www.openslr.org/resources/95/
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/openslr/get_data.py
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/openslr_95/get_data.py
* Command Line Instruction:

.. code-block:: bash
$ python scripts/dataset_processing/tts/openslr/get_data.py \
$ python scripts/dataset_processing/tts/openslr_95/get_data.py \
--data-root <your_local_dataset_root> \
--val-size 0.1 \
--test-size 0.2 \
--seed-for-ds-split 100
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path openslr/ds_conf \
--config-path openslr_95/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
Expand Down Expand Up @@ -130,4 +132,25 @@ HUI Audio Corpus German
--config-path hui_acg/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
SFSpeech Chinese/English Bilingual Speech
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Dataset URL: https://catalog.ngc.nvidia.com/orgs/nvidia/resources/sf_bilingual_speech_zh_en
* Dataset Processing Script: https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/sfbilingual/get_data.py
* Command Line Instruction:

.. code-block:: bash
$ python scripts/dataset_processing/tts/sfbilingual/get_data.py \
--data-root <your_local_dataset_root> \
--val-size 0.1 \
--test-size 0.2 \
--seed-for-ds-split 100
$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path sfbilingual/ds_conf \
--config-name ds_for_fastpitch_align.yaml \
manifest_filepath=<your_path_to_train_manifest> \
sup_data_path=<your_path_to_where_to_save_supplementary_data>
9 changes: 9 additions & 0 deletions nemo/collections/tts/models/fastpitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]':
)
list_of_models.append(model)

# zh, single speaker, 22050Hz, SFSpeech Bilingual Chinese/English dataset
model = PretrainedModelInfo(
pretrained_model_name="tts_zh_fastpitch_sfspeech",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo",
description="This model is trained on a single female speaker in SFSpeech Bilingual Chinese/English dataset sampled at 22050Hz and can be used to generate female Mandarin Chinese voices.",
class_=cls,
)
list_of_models.append(model)

return list_of_models

# Methods for model exportability
Expand Down
11 changes: 11 additions & 0 deletions nemo/collections/tts/models/hifigan.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]':
)
list_of_models.append(model)

# Spanish, multi-speaker, 44100 Hz, Latin American Spanish OpenSLR
model = PretrainedModelInfo(
pretrained_model_name="tts_es_hifigan_ft_fastpitch_multispeaker",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.14.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo",
Expand All @@ -413,6 +414,16 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]':
)
list_of_models.append(model)

# zh, single female speaker, 22050 Hz, SFSpeech Chinese/English Bilingual Dataset.
model = PretrainedModelInfo(
pretrained_model_name="tts_zh_hifigan_sfspeech",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo",
description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_hifitts_hifigan_ft_fastpitch` "
"by the mel-spectrograms generated from the FastPitch checkpoint `tts_zh_fastpitch_sfspeech`. This model "
"has been tested on generating female Mandarin Chinese voices.",
class_=cls,
)
list_of_models.append(model)
return list_of_models

def load_state_dict(self, state_dict, strict=True):
Expand Down

0 comments on commit 7104f93

Please sign in to comment.