diff --git a/nemo/collections/common/tokenizers/text_to_speech/__init__.py b/nemo/collections/common/tokenizers/text_to_speech/__init__.py
index b2671f283429..2db92b257416 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/__init__.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/__init__.py
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from nemo.collections.common.tokenizers.text_to_speech.tokenizer_wrapper import TextToSpeechTokenizer
diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
index 48745a0da35c..e919339cce01 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -14,6 +14,9 @@
 
 # fmt: off
+
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES"]
+
 DEFAULT_PUNCTUATION = (
     ',', '.', '!', '?', '-',
     ':', ';', '/', '"', '(',
@@ -62,31 +65,48 @@
     'ʊ', 'ʌ', 'ʒ', '̃', 'θ'
     )
 }
+
 # fmt: on
 
 
+def validate_locale(locale):
+    if locale not in SUPPORTED_LOCALES:
+        raise ValueError(f"Unsupported locale '{locale}'. " f"Supported locales {SUPPORTED_LOCALES}")
+
+
 def get_grapheme_character_set(locale):
     if locale not in GRAPHEME_CHARACTER_SETS:
-        raise ValueError(f"Grapheme character set not found for locale {locale}")
+        raise ValueError(
+            f"Grapheme character set not found for locale '{locale}'. "
+            f"Supported locales {GRAPHEME_CHARACTER_SETS.keys()}"
+        )
     char_set = set(GRAPHEME_CHARACTER_SETS[locale])
     return char_set
 
 
 def get_ipa_character_set(locale):
     if locale not in IPA_CHARACTER_SETS:
-        raise ValueError(f"IPA character set not found for locale {locale}")
+        raise ValueError(
+            f"IPA character set not found for locale '{locale}'. "
+            f"Supported locales {IPA_CHARACTER_SETS.keys()}"
+        )
     char_set = set(IPA_CHARACTER_SETS[locale])
     return char_set
 
 
 def get_ipa_punctuation_list(locale):
-    punct_list = list(DEFAULT_PUNCTUATION)
+    if locale is None:
+        return sorted(list(DEFAULT_PUNCTUATION))
+
+    validate_locale(locale)
+
+    punct_set = set(DEFAULT_PUNCTUATION)
     if locale in ["de-DE", "es-ES"]:
         # https://en.wikipedia.org/wiki/Guillemet#Uses
-        punct_list.extend(['«', '»', '‹', '›'])
+        punct_set.update(['«', '»', '‹', '›'])
     if locale == "de-DE":
-        punct_list.extend(['„', '“'])
+        punct_set.update(['„', '“'])
     elif locale == "es-ES":
-        punct_list.extend(['¿', '¡'])
+        punct_set.update(['¿', '¡'])
+    punct_list = sorted(list(punct_set))
     return punct_list
diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
index c9b15ac41196..9bd473b3c7c2 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -23,10 +23,11 @@
     chinese_text_preprocessing,
     english_text_preprocessing,
     german_text_preprocessing,
+    ipa_text_preprocessing,
     spanish_text_preprocessing,
 )
 
-from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_ipa_punctuation_list
+from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_ipa_punctuation_list, validate_locale
 from nemo.utils import logging
 from nemo.utils.decorators import experimental
@@ -500,17 +501,10 @@ def set_phone_prob(self, prob):
 
 @experimental
 class IPATokenizer(BaseTokenizer):
-    # fmt: off
-    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
-    # fmt: on
-
     def __init__(
         self,
         g2p,
+        locale="en-US",
         punct=True,
         non_default_punct_list=None,
         *,
@@ -521,11 +515,13 @@ def __init__(
         sep='|',  # To be able to distinguish between symbols
         add_blank_at=None,
         pad_with_space=False,
-        text_preprocessing_func=lambda text: english_text_preprocessing(text, lower=False),
     ):
         """General-purpose IPA-based tokenizer.
         Args:
             g2p: Grapheme to phoneme module, should be IPAG2P or some subclass thereof.
+            locale: Locale used to determine default text processing logic and punctuation.
+                Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+                Specify None if implementing custom logic for a new locale.
             punct: Whether to reserve grapheme for basic punctuation or not.
             non_default_punct_list: List of punctuation marks which will be used instead default, if any.
             space: Space token as string.
@@ -534,12 +530,8 @@ def __init__(
             oov: OOV token as string.
             sep: Separation token as string.
             add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
-             if None then no blank in labels.
+                if None then no blank in labels.
             pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
-            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
-                Basically, it replaces all non-unicode characters with unicode ones.
-                Note that lower() function shouldn't applied here, in case the text contains phonemes (it will be handled by g2p).
-                Defaults to English text preprocessing.
""" if not hasattr(g2p, "symbols"): logging.error( @@ -549,6 +541,9 @@ def __init__( ) raise ValueError("G2P modules passed into the IPATokenizer must have `symbols` defined.") + if locale is not None: + validate_locale(locale) + self.phoneme_probability = None if hasattr(g2p, "phoneme_probability"): self.phoneme_probability = g2p.phoneme_probability @@ -561,8 +556,11 @@ def __init__( if punct: if non_default_punct_list is not None: - self.PUNCT_LIST = non_default_punct_list - tokens.update(self.PUNCT_LIST) + self.punct_list = non_default_punct_list + else: + self.punct_list = get_ipa_punctuation_list(locale) + + tokens.update(self.punct_list) # Sort to ensure that vocab is in the same order every time tokens = sorted(list(tokens)) @@ -580,9 +578,13 @@ def __init__( self.punct = punct self.pad_with_space = pad_with_space - self.text_preprocessing_func = text_preprocessing_func self.g2p = g2p + if locale == "en-US": + self.text_preprocessing_func = lambda text: english_text_preprocessing(text, lower=False) + else: + self.text_preprocessing_func = ipa_text_preprocessing + def encode(self, text): """See base class for more information.""" @@ -608,7 +610,7 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): elif p in tokens: # Add next phoneme or char (if chars=True) ps.append(p) - elif (p in self.PUNCT_LIST) and self.punct: + elif (p in self.punct_list) and self.punct: # Add punct ps.append(p) elif p != space: diff --git a/nemo_text_processing/g2p/data/data_utils.py b/nemo_text_processing/g2p/data/data_utils.py index 1a41c184ac41..b91f533de858 100644 --- a/nemo_text_processing/g2p/data/data_utils.py +++ b/nemo_text_processing/g2p/data/data_utils.py @@ -115,7 +115,8 @@ def english_text_preprocessing(text, lower=True): def _word_tokenize(words): """ - Convert text (str) to List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not. + Convert text (str) to List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and + flag whether to leave unchanged or not. Word can be one of: valid english word, any substring starts from | to | (unchangeable word) or punctuation marks. This function expects that unchangeable word is carefully divided by spaces (e.g. HH AH L OW). Unchangeable word will be splitted by space and represented as List[str], other cases are represented as str. 
@@ -146,6 +147,10 @@ def ipa_word_tokenize(text):
     return _word_tokenize(words)
 
 
+def ipa_text_preprocessing(text):
+    return text.lower()
+
+
 def german_text_preprocessing(text):
     return text.lower()
diff --git a/nemo_text_processing/g2p/modules.py b/nemo_text_processing/g2p/modules.py
index 4a49c39c6f67..924fa9f3c716 100644
--- a/nemo_text_processing/g2p/modules.py
+++ b/nemo_text_processing/g2p/modules.py
@@ -23,8 +23,9 @@
 import nltk
 import torch
 
-from nemo_text_processing.g2p.data.data_utils import english_word_tokenize
+from nemo_text_processing.g2p.data.data_utils import english_word_tokenize, ipa_word_tokenize
 
+from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
 from nemo.utils import logging
 from nemo.utils.decorators import experimental
 from nemo.utils.get_rank import is_global_rank_zero
@@ -263,7 +264,7 @@ class IPAG2P(BaseG2p):
     def __init__(
         self,
         phoneme_dict: Union[str, pathlib.Path, dict],
-        word_tokenize_func: Callable[[str], List[Tuple[Union[str, List[str]], bool]]] = english_word_tokenize,
+        locale: str = "en-US",
         apply_to_oov_word: Optional[Callable[[str], str]] = None,
         ignore_ambiguous_words: bool = True,
         heteronyms: Optional[Union[str, pathlib.Path, List[str]]] = None,
@@ -281,13 +282,9 @@ def __init__(
             phoneme_dict (str, Path, Dict): Path to file in CMUdict format or a IPA dict object with CMUdict-like entries.
                 a dictionary file example: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
                 a dictionary object example: {..., "WIRE": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}
-            word_tokenize_func: Function for tokenizing text to words.
-                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word
-                representation and flag whether to leave unchanged or not.
-                It is expected that unchangeable word representation will be represented as List[str], other
-                cases are represented as str.
-                It is useful to mark word as unchangeable which is already in phoneme representation.
-                Defaults to the English word tokenizer.
+            locale: Locale used to determine default tokenization logic.
+                Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+                Specify None if implementing custom logic for a new locale.
             apply_to_oov_word: Function that will be applied to out of phoneme_dict word.
             ignore_ambiguous_words: Whether to not handle word via phoneme_dict with ambiguous phoneme sequences.
                 Defaults to True.
@@ -307,6 +304,9 @@ def __init__(
         self.phoneme_probability = phoneme_probability
         self._rng = random.Random()
 
+        if locale is not None:
+            validate_locale(locale)
+
         if not use_chars and self.phoneme_probability is not None:
             self.use_chars = True
             logging.warning(
@@ -348,6 +348,14 @@ def __init__(
                 "you may see unexpected deletions in your input."
             )
 
+        # word_tokenize_func returns a List[Tuple[Union[str, List[str]], bool]] where every tuple denotes
+        # a word representation (a word string or a list of tokens) and a flag indicating whether to process
+        # the word or leave it unchanged.
+ if locale == "en-US": + word_tokenize_func = english_word_tokenize + else: + word_tokenize_func = ipa_word_tokenize + super().__init__( phoneme_dict=self.phoneme_dict, word_tokenize_func=word_tokenize_func, diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py index 2e56b0f50fa6..6077c78c6523 100644 --- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py +++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py @@ -13,15 +13,27 @@ # limitations under the License. import pytest +from nemo_text_processing.g2p.modules import IPAG2P from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import ( EnglishCharsTokenizer, GermanCharsTokenizer, + IPATokenizer, SpanishCharsTokenizer, ) class TestTTSTokenizers: + PHONEME_DICT_DE = { + "HALLO": ["hˈaloː"], + "WELT": ["vˈɛlt"], + } + PHONEME_DICT_EN = {"HELLO": ["həˈɫoʊ"], "WORLD": ["ˈwɝɫd"], "CAFE": ["kəˈfeɪ"]} + PHONEME_DICT_ES = { + "BUENOS": ["bwˈenos"], + "DÍAS": ["dˈias"], + } + @staticmethod def _parse_text(tokenizer, text): tokens = tokenizer.encode(text) @@ -88,3 +100,60 @@ def test_spanish_chars_tokenizer(self): assert chars == expected_output assert len(tokens) == len(input_text) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer(self): + input_text = "Hello world!" + expected_output = " həˈɫoʊ ˈwɝɫd! " + + g2p = IPAG2P(phoneme_dict=self.PHONEME_DICT_EN) + + tokenizer = IPATokenizer(g2p=g2p, locale=None, pad_with_space=True) + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer_unsupported_locale(self): + g2p = IPAG2P(phoneme_dict=self.PHONEME_DICT_EN) + with pytest.raises(ValueError, match="Unsupported locale"): + IPATokenizer(g2p=g2p, locale="asdf") + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer_de_de(self): + input_text = "Hallo welt" + expected_output = "hˈaloː vˈɛlt" + + g2p = IPAG2P(phoneme_dict=self.PHONEME_DICT_DE, locale="de-DE") + tokenizer = IPATokenizer(g2p=g2p, locale="de-DE") + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer_en_us(self): + input_text = "Hello café." + expected_output = "həˈɫoʊ kəˈfeɪ." + g2p = IPAG2P(phoneme_dict=self.PHONEME_DICT_EN) + + tokenizer = IPATokenizer(g2p=g2p, locale="en-US") + tokenizer.tokens.extend("CAFE") + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer_es_es(self): + input_text = "¡Buenos días!" + expected_output = "¡bwˈenos dˈias!" 
+
+        g2p = IPAG2P(phoneme_dict=self.PHONEME_DICT_ES, locale="es-ES")
+        tokenizer = IPATokenizer(g2p=g2p, locale="es-ES")
+        chars, tokens = self._parse_text(tokenizer, input_text)
+
+        assert chars == expected_output
diff --git a/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_de.txt b/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_de.txt
new file mode 100644
index 000000000000..5c912b276bf6
--- /dev/null
+++ b/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_de.txt
@@ -0,0 +1,2 @@
+HALLO hˈaloː
+WELT vˈɛlt
diff --git a/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_es.txt b/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_es.txt
new file mode 100644
index 000000000000..112575be7c68
--- /dev/null
+++ b/tests/nemo_text_processing/g2p/phoneme_dict/test_dict_es.txt
@@ -0,0 +1,2 @@
+HOLA ˈola
+MUNDO mˈundo
diff --git a/tests/nemo_text_processing/g2p/test_modules.py b/tests/nemo_text_processing/g2p/test_modules.py
index ccc6a23f7ade..0fff1018de78 100644
--- a/tests/nemo_text_processing/g2p/test_modules.py
+++ b/tests/nemo_text_processing/g2p/test_modules.py
@@ -20,11 +20,15 @@
 
 
 class TestModules:
-    PHONEME_DICT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "phoneme_dict", "test_dict.txt")
+    PHONEME_DICT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "phoneme_dict")
+    PHONEME_DICT_PATH_DE = os.path.join(PHONEME_DICT_DIR, "test_dict_de.txt")
+    PHONEME_DICT_PATH_EN = os.path.join(PHONEME_DICT_DIR, "test_dict.txt")
+    PHONEME_DICT_PATH_ES = os.path.join(PHONEME_DICT_DIR, "test_dict_es.txt")
 
     @staticmethod
     def _create_g2p(
-        phoneme_dict=PHONEME_DICT_PATH,
+        phoneme_dict=PHONEME_DICT_PATH_EN,
+        locale=None,
         apply_to_oov_word=lambda x: x,
         use_chars=False,
         phoneme_probability=None,
@@ -32,6 +36,7 @@ def _create_g2p(
     ):
         return IPAG2P(
             phoneme_dict,
+            locale=locale,
             apply_to_oov_word=apply_to_oov_word,
             use_chars=use_chars,
             phoneme_probability=phoneme_probability,
@@ -84,6 +89,22 @@ def test_ipa_g2p(self):
         phonemes = g2p(input_text)
         assert phonemes == expected_output
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_with_dict_input(self):
+        input_text = "Hello world."
+        expected_output = [char for char in "həˈɫoʊ ˈwɝɫd."]
+
+        phoneme_dict = {"HELLO": ["həˈɫoʊ"], "WORLD": ["ˈwɝɫd"]}
+
+        g2p_file = self._create_g2p()
+        g2p_dict = self._create_g2p(phoneme_dict=phoneme_dict)
+
+        phonemes_file = g2p_file(input_text)
+        phonemes_dict = g2p_dict(input_text)
+        assert phonemes_dict == expected_output
+        assert phonemes_file == phonemes_dict
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_ipa_g2p_with_oov(self):
@@ -123,3 +144,49 @@ def test_ipa_g2p_graphemes_lower(self):
 
         phonemes = g2p(input_text)
         assert phonemes == expected_output
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_with_escaped_characters(self):
+        input_text = "Hello |wo rld|."
+        expected_output = ["h", "ə", "ˈ", "ɫ", "o", "ʊ", " ", "wo", "rld", "."]
+        g2p = self._create_g2p()
+
+        phonemes = g2p(input_text)
+        assert phonemes == expected_output
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_unsupported_locale(self):
+        with pytest.raises(ValueError, match="Unsupported locale"):
+            self._create_g2p(locale="en-USA")
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_de_de(self):
+        input_text = "Hallo „welt“!"
+        expected_output = [char for char in "hˈaloː „vˈɛlt“!"]
+        g2p = self._create_g2p(phoneme_dict=self.PHONEME_DICT_PATH_DE, locale="de-DE")
+
+        phonemes = g2p(input_text)
+        assert phonemes == expected_output
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_en_us(self):
+        input_text = "Hello Kitty!"
+        expected_output = [char for char in "həˈɫoʊ KITTY!"]
+        g2p = self._create_g2p(locale="en-US")
+
+        phonemes = g2p(input_text)
+        assert phonemes == expected_output
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_g2p_es_es(self):
+        input_text = "¿Hola mundo, amigo?"
+        expected_output = [char for char in "¿ˈola mˈundo, AMIGO?"]
+        g2p = self._create_g2p(phoneme_dict=self.PHONEME_DICT_PATH_ES, locale="es-ES")
+
+        phonemes = g2p(input_text)
+        assert phonemes == expected_output
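Editor's note: taken together, the diff replaces the per-callable configuration (`text_preprocessing_func` on IPATokenizer, `word_tokenize_func` on IPAG2P) with a single `locale` argument that selects preprocessing, word tokenization, and the default punctuation list. A minimal usage sketch of the new API follows; the inline phoneme dictionary mirrors the German test fixtures above rather than a shipped file, and the `decode`/`sep` handling is an assumption based on what the `_parse_text` test helper implies about BaseTokenizer, not something shown in this diff.

    from nemo_text_processing.g2p.modules import IPAG2P
    from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import IPATokenizer

    # Illustrative in-memory dictionary; real runs would typically pass a path to a
    # CMUdict-style file such as scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt.
    phoneme_dict = {"HALLO": ["hˈaloː"], "WELT": ["vˈɛlt"]}

    # locale="de-DE" is checked against SUPPORTED_LOCALES via validate_locale() and
    # routes word tokenization to ipa_word_tokenize instead of the English tokenizer.
    g2p = IPAG2P(phoneme_dict=phoneme_dict, locale="de-DE")

    # The same locale selects ipa_text_preprocessing (lowercasing) and extends the
    # default punctuation with guillemets and German quotes via get_ipa_punctuation_list.
    tokenizer = IPATokenizer(g2p=g2p, locale="de-DE")

    token_ids = tokenizer.encode("Hallo Welt!")
    # Assumed decode behavior: BaseTokenizer joins symbols with sep='|', so strip it.
    text = tokenizer.decode(token_ids).replace(tokenizer.sep, "")
    print(text)  # expected to resemble "hˈaloː vˈɛlt!"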