[TTS] expand to support flexible dictionary entry formats in IPAG2P. (NVIDIA#5318)

* expand to support flexible dictionary entry formats in IPAG2P.
* removed unused imports in tests.collections.tts
* removed unused imports in nemo.collections.tts.modules
* removed unused imports in nemo_text_processing.text_normalization.zh
* updated unit tests with new cases
* renamed test functions because we only test IPAG2P rather than all classes in modules.py.
* revised the current test dict to use a single space between word and pronunciation.

Signed-off-by: Xuesong Yang <[email protected]>
Signed-off-by: andrusenkoau <[email protected]>
XuesongYang authored and andrusenkoau committed Jan 5, 2023
1 parent 1be8732 commit 580ce4a
Showing 9 changed files with 50 additions and 38 deletions.
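
As context for the diffs below, here is a minimal, self-contained sketch of the flexible dictionary parsing described in the commit message. The helper name parse_ipa_dict is hypothetical; the actual logic this commit adds lives inside IPAG2P.__init__ in nemo_text_processing/g2p/modules.py (see that file's diff).

import re
from collections import defaultdict

# Illustrative sketch only: mirrors the parsing this commit adds to IPAG2P.__init__.
# The function name parse_ipa_dict is hypothetical and not part of the commit.
_alt_re = re.compile(r"\([0-9]+\)")  # variant suffixes such as "(1)" in "LEAD(1)"

def parse_ipa_dict(path):
    """Parse a phoneme dictionary whose entries may be "WORD pron" or "WORD p r o n"."""
    phoneme_dict_obj = defaultdict(list)
    with open(path, "r") as fdict:
        for line in fdict:
            if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
                parts = line.strip().split(maxsplit=1)      # split the word from the pronunciation
                word = re.sub(_alt_re, "", parts[0])        # drop the "(N)" variant suffix from the word
                prons = re.sub(r"\s+", "", parts[1])        # remove any spaces inside the pronunciation
                phoneme_dict_obj[word].append(list(prons))  # store the pronunciation as a list of IPA symbols
    return phoneme_dict_obj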
nemo/collections/tts/modules/common.py (3 changes: 1 addition & 2 deletions)
@@ -14,8 +14,7 @@

###############################################################################

-import ast
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

import numpy as np
import torch
nemo/collections/tts/modules/radtts.py (2 changes: 0 additions & 2 deletions)
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import pdb

###############################################################################
import torch
@@ -32,7 +31,6 @@
LinearNorm,
get_mask_from_lengths,
getRadTTSEncoder,
-sort_tensor,
)
from nemo.collections.tts.modules.submodules import PartialConv1d
from nemo.core.classes import Exportable, NeuralModule
nemo/collections/tts/modules/submodules.py (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

import torch
from torch.autograd import Variable
nemo_text_processing/g2p/modules.py (10 changes: 5 additions & 5 deletions)
@@ -321,14 +321,14 @@ def __init__(
# load the dictionary file where there may exist a digit suffix after a word, which
# represents the pronunciation variant of that word.
phoneme_dict_obj = defaultdict(list)
-_alt_re = re.compile(r'\([0-9]+\)')
+_alt_re = re.compile(r"\([0-9]+\)")
with open(phoneme_dict, "r") as fdict:
for line in fdict:
if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
-parts = line.strip().split(" ")
-assert len(parts) == 2, f"Wrong format for the entry: {line.strip()}."
-word = re.sub(_alt_re, '', parts[0])
-phoneme_dict_obj[word].append(list(parts[1]))
+parts = line.strip().split(maxsplit=1)
+word = re.sub(_alt_re, "", parts[0])
+prons = re.sub(r"\s+", "", parts[1])
+phoneme_dict_obj[word].append(list(prons))
else:
# Load phoneme_dict as dictionary object
logging.info("Loading phoneme_dict as a Dict object.")
@@ -14,7 +14,7 @@
import os

import pynini
-from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_SIGMA, GraphFst
+from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst
from nemo_text_processing.text_normalization.zh.taggers.cardinal import Cardinal
from nemo_text_processing.text_normalization.zh.taggers.char import Char
from nemo_text_processing.text_normalization.zh.taggers.date import Date
tests/collections/tts/test_torch_tts.py (1 change: 0 additions & 1 deletion)
@@ -16,7 +16,6 @@
import os
from pathlib import Path

-import numpy as np
import pytest
import torch
from nemo_text_processing.g2p.modules import EnglishG2p
tests/collections/tts/test_waveglow.py (3 changes: 0 additions & 3 deletions)
@@ -14,15 +14,12 @@

import os
import tempfile
-from unittest import TestCase

-import onnx
import pytest
import torch
from omegaconf import DictConfig

from nemo.collections.tts.models import WaveGlowModel
-from nemo.collections.tts.modules import WaveGlowModule
from nemo.core.classes import typecheck

mcfg = DictConfig(
tests/nemo_text_processing/g2p/phoneme_dict/test_dict.txt (5 changes: 4 additions & 1 deletion)
@@ -1,2 +1,5 @@
HELLO həˈɫoʊ
-WORLD ˈwɝɫd
+WORLD ˈwɝɫd
+LEAD ˈlɛd
+LEAD(1) ˈ l i d
+NVIDIA ɛ n ˈ v ɪ d i ə
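
Applied to the updated test_dict.txt above, the parser sketched after the change summary would merge these entries as the new unit tests below expect; this usage snippet is hypothetical and not part of the commit.

entries = parse_ipa_dict("tests/nemo_text_processing/g2p/phoneme_dict/test_dict.txt")

assert entries["HELLO"] == [list("həˈɫoʊ")]
assert entries["LEAD"] == [list("ˈlɛd"), list("ˈlid")]   # "LEAD" and "LEAD(1)" merge under one key
assert entries["NVIDIA"] == [list("ɛnˈvɪdiə")]           # spaces inside the pronunciation are removed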
tests/nemo_text_processing/g2p/test_modules.py (60 changes: 38 additions & 22 deletions)
@@ -18,7 +18,7 @@
from nemo_text_processing.g2p.modules import IPAG2P


-class TestModules:
+class TestIPAG2P:

PHONEME_DICT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "phoneme_dict")
PHONEME_DICT_PATH_DE = os.path.join(PHONEME_DICT_DIR, "test_dict_de.txt")
@@ -45,43 +45,59 @@ def _create_g2p(

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_parse_dict(self):
+def test_normalize_dict_with_phonemes(self):
# fmt: off
expected_symbols = {
-'h', 'ə', 'ˈ', 'ɫ', 'o', 'ʊ', 'ˈ',
-'w', 'ɝ', 'ɫ', 'd'
+'h', 'ə', 'ˈ', 'ɫ', 'o', 'ʊ',
+'ˈ', 'w', 'ɝ', 'ɫ', 'd',
+'ˈ', 'l', 'ɛ', 'd',
+'ˈ', 'l', 'i', 'd',
+'ɛ', 'n', 'ˈ', 'v', 'ɪ', 'd', 'i', 'ə'
}
# fmt: on
g2p = self._create_g2p()

assert expected_symbols == g2p.symbols
assert len(g2p.phoneme_dict["HELLO"]) == 1
assert len(g2p.phoneme_dict["WORLD"]) == 1
assert g2p.phoneme_dict["HELLO"][0] == [char for char in "həˈɫoʊ"]
assert g2p.phoneme_dict["WORLD"][0] == [char for char in "ˈwɝɫd"]
assert len(g2p.phoneme_dict["LEAD"]) == 2
assert len(g2p.phoneme_dict["NVIDIA"]) == 1
assert g2p.phoneme_dict["HELLO"][0] == list("həˈɫoʊ")
assert g2p.phoneme_dict["WORLD"][0] == list("ˈwɝɫd")
assert g2p.phoneme_dict["LEAD"] == [list("ˈlɛd"), list("ˈlid")]
assert g2p.phoneme_dict["NVIDIA"][0] == list("ɛnˈvɪdiə")

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_parse_dict_with_chars(self):
+def test_normalize_dict_with_graphemes_and_phonemes(self):
# fmt: off
expected_symbols = {
'H', 'E', 'L', 'L', 'O',
'W', 'O', 'R', 'L', 'D',
+'L', 'E', 'A', 'D',
+'N', 'V', 'I', 'D', 'I', 'A',
'h', 'ə', 'ˈ', 'ɫ', 'o', 'ʊ',
-'ˈ', 'w', 'ɝ', 'ɫ', 'd'
+'ˈ', 'w', 'ɝ', 'ɫ', 'd',
+'ˈ', 'l', 'ɛ', 'd',
+'ˈ', 'l', 'i', 'd',
+'ɛ', 'n', 'ˈ', 'v', 'ɪ', 'd', 'i', 'ə'
}
# fmt: on
g2p = self._create_g2p(use_chars=True)

assert expected_symbols == g2p.symbols
assert len(g2p.phoneme_dict["HELLO"]) == 1
assert len(g2p.phoneme_dict["WORLD"]) == 1
assert g2p.phoneme_dict["HELLO"][0] == [char for char in "həˈɫoʊ"]
assert g2p.phoneme_dict["WORLD"][0] == [char for char in "ˈwɝɫd"]
assert len(g2p.phoneme_dict["LEAD"]) == 2
assert len(g2p.phoneme_dict["NVIDIA"]) == 1
assert g2p.phoneme_dict["HELLO"][0] == list("həˈɫoʊ")
assert g2p.phoneme_dict["WORLD"][0] == list("ˈwɝɫd")
assert g2p.phoneme_dict["LEAD"] == [list("ˈlɛd"), list("ˈlid")]
assert g2p.phoneme_dict["NVIDIA"][0] == list("ɛnˈvɪdiə")

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p(self):
+def test_forward_call(self):
input_text = "Hello world."
expected_output = [char for char in "həˈɫoʊ ˈwɝɫd."]
g2p = self._create_g2p()
@@ -91,11 +107,11 @@ def test_ipa_g2p(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_with_dict_input(self):
+def test_forward_call_with_file_or_object_dict_type(self):
input_text = "Hello world."
expected_output = [char for char in "həˈɫoʊ ˈwɝɫd."]

-phoneme_dict = {"HELLO": ["həˈɫoʊ"], "WORLD": ["ˈwɝɫd"]}
+phoneme_dict = {"HELLO": ["həˈɫoʊ"], "WORLD": ["ˈwɝɫd"], "LEAD": ["ˈlɛd", "ˈlid"], "NVIDIA": ["ɛnˈvɪdiə"]}

g2p_file = self._create_g2p()
g2p_dict = self._create_g2p(phoneme_dict=phoneme_dict)
@@ -107,7 +123,7 @@ def test_ipa_g2p_with_dict_input(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_with_oov(self):
+def test_forward_call_with_oov_word(self):
input_text = "Hello Kitty!"
expected_output = [char for char in "həˈɫoʊ KITTY!"]
g2p = self._create_g2p()
@@ -117,7 +133,7 @@ def test_ipa_g2p_with_oov(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_with_oov_func(self):
+def test_forward_call_with_oov_func(self):
input_text = "Hello Kitty!"
expected_output = [char for char in "həˈɫoʊ test!"]
g2p = self._create_g2p(apply_to_oov_word=lambda x: "test")
@@ -127,7 +143,7 @@ def test_ipa_g2p_with_oov_func(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_graphemes(self):
+def test_forward_call_with_graphemes_uppercase(self):
input_text = "Hello world."
expected_output = [char for char in input_text.upper()]
g2p = self._create_g2p(use_chars=True, phoneme_probability=0.0)
@@ -137,7 +153,7 @@ def test_ipa_g2p_graphemes(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_graphemes_lower(self):
+def test_forward_call_with_graphemes_lowercase(self):
input_text = "Hello world."
expected_output = [char for char in input_text.lower()]
g2p = self._create_g2p(use_chars=True, phoneme_probability=0.0, set_graphemes_upper=False)
@@ -147,7 +163,7 @@ def test_ipa_g2p_graphemes_lower(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_with_escaped_characters(self):
+def test_forward_call_with_escaped_characters(self):
input_text = "Hello |wo rld|."
expected_output = ["h", "ə", "ˈ", "ɫ", "o", "ʊ", " ", "wo", "rld", "."]
g2p = self._create_g2p()
@@ -157,13 +173,13 @@ def test_ipa_g2p_with_escaped_characters(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_unsupported_locale(self):
+def test_instantiate_unsupported_locale(self):
with pytest.raises(ValueError, match="Unsupported locale"):
self._create_g2p(locale="en-USA")

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_de_de(self):
+def test_forward_call_de_de(self):
input_text = "Hallo „welt“!"
expected_output = [char for char in "hˈaloː „vˈɛlt“!"]
g2p = self._create_g2p(phoneme_dict=self.PHONEME_DICT_PATH_DE, locale="de-DE")
@@ -173,7 +189,7 @@ def test_ipa_g2p_de_de(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_en_us(self):
+def test_forward_call_en_us(self):
input_text = "Hello Kitty!"
expected_output = [char for char in "həˈɫoʊ KITTY!"]
g2p = self._create_g2p(locale="en-US")
@@ -183,7 +199,7 @@ def test_ipa_g2p_en_us(self):

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
-def test_ipa_g2p_es_es(self):
+def test_forward_call_es_es(self):
input_text = "¿Hola mundo, amigo?"
expected_output = [char for char in "¿ˈola mˈundo, AMIGO?"]
g2p = self._create_g2p(phoneme_dict=self.PHONEME_DICT_PATH_ES, locale="es-ES")
