Commit 57249b4
Revert "Jpg2p jun18 (#9538)" (#9874)
* Revert "Jpg2p jun18 (#9538)"

This reverts commit 53d7a91.

* Apply isort and black reformatting

Signed-off-by: pablo-garay <[email protected]>

---------

Signed-off-by: pablo-garay <[email protected]>
Co-authored-by: pablo-garay <[email protected]>
Signed-off-by: Alexandros Koumparoulis <[email protected]>
2 people authored and akoumpa committed Jul 25, 2024
1 parent ef703f8 commit 57249b4
Showing 7 changed files with 3 additions and 17,241 deletions.
59 changes: 3 additions & 56 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@
 
 # fmt: off
 
-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]
 
 DEFAULT_PUNCTUATION = (
     ',', '.', '!', '?', '-',
@@ -104,17 +104,6 @@
         'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
         'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
     ),
-    "ja-JP": (
-        'a', 'i', 'u', 'e', 'o', 'ɯ', 'I', 'ɑ' , 'ɨ ', 'ɒ',
-        'ɔ', 'iᵑ', 'eᵑ', 'a', 'ʊ', 'ə', 'eᵝ', 'ɐ', 'ɛ',
-        'w', 'k', 'ɾ', 's', 't', 'ʃ', 'r', 'h', 'n', 'nʲ',
-        'ɲ', 'ç', 'b', 'm', 'j', 'ɸ', 'z', 'p', 'd', 'N',
-        'ʒ', 'ŋ', 'g', 'f', 'ʔ', 'y', 'ɟ', 'v', 'ɥ', 'ɰ',
-        'ɰᵝ', 'ɣ', 'ʄ', 'ʑ', 'c', 'ɕ', 'ɠ', 'x', 'l', 'β',
-        'ð', 'ø', 'ʁ', 'ts', 'tʃ', 'dʒ', 'y', 'dʑ', 't͡s',
-        'ɑ̃', 'ĩ', 'ũ', 'ẽ', 'õ', 'ɑ̃', 'ĩ', 'ũ', 'w̃',
-        'ẽ', 'õ', 'hʲ', 'ɪ', 'ː', 'o̞', 'e̞',
-    ),
 }
 
 GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"]
@@ -168,7 +157,7 @@ def get_ipa_punctuation_list(locale):
     punct_set = set(DEFAULT_PUNCTUATION)
     # TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
     # in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-    if locale in ["de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP"]:
+    if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]:
         # ref: https://en.wikipedia.org/wiki/Guillemet#Uses
         punct_set.update(['«', '»', '‹', '›'])
     if locale == "de-DE":
@@ -229,48 +218,6 @@ def get_ipa_punctuation_list(locale):
                 '̧', # combining cedilla, U+0327, decimal 807
             ]
         )
-    elif locale == "ja-JP":
-        # ref: https://en.wikipedia.org/wiki/List_of_Japanese_typographic_symbols
-        punct_set.update(
-            [
-                '【',
-                '】',
-                '…',
-                '‥',
-                '「',
-                '」',
-                '『',
-                '』',
-                '〜',
-                '。',
-                '、',
-                'ー',
-                '・・・',
-                '〃',
-                '〔',
-                '〕',
-                '⦅',
-                '⦆',
-                '〈',
-                '〉',
-                '《',
-                '》',
-                '〖',
-                '〗',
-                '〘',
-                '〙',
-                '〚',
-                '〛',
-                '•',
-                '◦',
-                '﹅',
-                '﹆',
-                '※',
-                '*',
-                '〽',
-                '〓',
-                '〒',
-            ]
-        )
 
     punct_list = sorted(list(punct_set))
     return punct_list
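
For reference, a minimal usage sketch of the function this hunk edits (not part of the commit; the import path is taken from the file header above, and the behavior is inferred from the hunks shown): after the revert, only the locales still listed in SUPPORTED_LOCALES receive locale-specific punctuation, and "ja-JP" no longer gets the additions removed above.

# Sketch (assumes a NeMo install that matches this file's path):
from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_ipa_punctuation_list

fr_punct = get_ipa_punctuation_list("fr-FR")  # DEFAULT_PUNCTUATION plus guillemets «», ‹›, etc.
en_punct = get_ipa_punctuation_list("en-US")  # closer to the defaults
print(sorted(set(fr_punct) - set(en_punct)))  # the fr-FR-specific marks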
5 changes: 0 additions & 5 deletions nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -29,7 +29,6 @@
"english_word_tokenize",
"LATIN_CHARS_ALL",
"normalize_unicode_text",
"japanese_text_preprocessing",
]

# Derived from LJSpeech
@@ -202,7 +201,3 @@ def chinese_text_preprocessing(text: str) -> str:
 
 def french_text_preprocessing(text: str) -> str:
     return text.lower()
-
-
-def japanese_text_preprocessing(text: str) -> str:
-    return text.lower()
112 changes: 0 additions & 112 deletions nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -30,7 +30,6 @@
     english_text_preprocessing,
     french_text_preprocessing,
     italian_text_preprocessing,
-    japanese_text_preprocessing,
     spanish_text_preprocessing,
 )
 from nemo.utils import logging
@@ -927,114 +926,3 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
             ps = [space] + ps + [space]
 
         return [self._token2id[p] for p in ps]
-
-
-class JapanesePhonemeTokenizer(BaseTokenizer):
-
-    JA_PUNCT_LIST = get_ipa_punctuation_list("ja-JP")
-
-    def __init__(
-        self,
-        g2p,
-        punct=True,
-        non_default_punct_list=None,
-        *,
-        space=' ',
-        silence=None,
-        apostrophe=True,
-        sep='|', # To be able to distinguish between 2/3 letters codes.
-        add_blank_at=None,
-        pad_with_space=False,
-        text_preprocessing_func=japanese_text_preprocessing,
-    ):
-        """Japanese phoneme-based tokenizer.
-        Note: This tokenizer for now covers Japanese phonemes
-        Args:
-            g2p: Grapheme to phoneme module.
-            punct: Whether to reserve grapheme for basic punctuation or not.
-            non_default_punct_list: List of punctuation marks which will be used instead default.
-            space: Space token as string.
-            silence: Silence token as string (will be disabled if it is None).
-            apostrophe: Whether to use apostrophe or not.
-            sep: Separation token as string.
-            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
-                if None then no blank in labels.
-            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
-            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
-                Basically, it replaces all non-unicode characters with unicode ones.
-                Note that lower() function shouldn't be applied here, in case the text contains phonemes (it will be handled by g2p).
-        """
-        tokens = []
-        self.space, tokens = len(tokens), tokens + [space] # Space
-
-        if silence is not None:
-            self.silence, tokens = len(tokens), tokens + [silence] # Silence
-
-        self.phoneme_list = g2p.phoneme_list
-        self.ascii_letter_list = g2p.ascii_letter_list
-
-        tokens.extend(self.phoneme_list)
-        tokens.extend(self.ascii_letter_list)
-
-        self.text_preprocessing_func = text_preprocessing_func
-
-        if apostrophe:
-            tokens.append("'") # Apostrophe
-
-        if punct:
-            if non_default_punct_list is not None:
-                self.PUNCT_LIST = non_default_punct_list
-            else:
-                self.PUNCT_LIST = list(self.JA_PUNCT_LIST)
-            tokens.extend(self.PUNCT_LIST)
-
-        super().__init__(tokens, sep=sep, add_blank_at=add_blank_at)
-
-        self.punct = punct
-        self.pad_with_space = pad_with_space
-        self.g2p = g2p
-
-    def encode(self, text: str) -> List[int]:
-        """See base class for more information."""
-        text = self.text_preprocessing_func(text)
-        g2p_text = self.g2p(text)
-        return self.encode_from_g2p(g2p_text, text)
-
-    def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
-        """
-        Encodes text that has already been run through G2P.
-        Called for encoding to tokens after text preprocessing and G2P.
-        Args:
-            g2p_text: G2P's output, could be a mixture of Chinese phonemes and English letters.
-            raw_text: original raw input
-        """
-        ps, space, tokens = [], self.tokens[self.space], set(self.tokens)
-        for p in g2p_text: # noqa
-            # Add space if last one isn't one
-            if p == space and len(ps) > 0 and ps[-1] != space:
-                ps.append(p)
-            # Add next phoneme or tone or ascii letter or apostrophe.
-            elif (
-                p.isalnum() or p == "'" or p in self.phoneme_list + self.tone_list + self.ascii_letter_list
-            ) and p in tokens:
-                ps.append(p)
-            # Add punctuation
-            elif (p in self.PUNCT_LIST) and self.punct:
-                ps.append(p)
-            # Warn about unknown char/phoneme
-            elif p != space:
-                message = f"Text: [{' '.join(g2p_text)}] contains unknown char/phoneme: [{p}]."
-                if raw_text is not None:
-                    message += f"Original text: [{raw_text}]. Symbol will be skipped."
-                logging.warning(message)
-
-        # Remove trailing spaces
-        if ps:
-            while ps[-1] == space:
-                ps.pop()
-
-        if self.pad_with_space:
-            ps = [space] + ps + [space]
-
-        return [self._token2id[p] for p in ps]
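
Two details of the removed class are worth noting. First, encode_from_g2p references self.tone_list, which __init__ never sets (the docstring's mention of "Chinese phonemes" suggests the method was adapted from the Chinese tokenizer), so any symbol that is neither alphanumeric nor an apostrophe would likely have raised AttributeError before reaching the punctuation branch. Second, the heart of the method is a symbol-filtering loop; the standalone sketch below distills that logic with hypothetical names (filter_g2p_symbols is not a NeMo API):

import logging
from typing import List, Set


def filter_g2p_symbols(
    g2p_text: List[str],
    known_tokens: Set[str],
    punct_list: Set[str],
    space: str = ' ',
    use_punct: bool = True,
) -> List[str]:
    """Keep known symbols, collapse spaces, gate punctuation, warn on the rest."""
    out: List[str] = []
    for p in g2p_text:
        if p == space and out and out[-1] != space:
            out.append(p)  # no leading or doubled spaces
        elif p != space and p in known_tokens:
            out.append(p)  # known phoneme, ASCII letter, or apostrophe
        elif use_punct and p in punct_list:
            out.append(p)  # punctuation, only when enabled
        elif p != space:
            logging.warning("Unknown char/phoneme skipped: [%s]", p)
    while out and out[-1] == space:
        out.pop()  # strip trailing spaces, as the original loop did
    return out


# Example: unknown symbols are skipped with a warning instead of raising.
print(filter_g2p_symbols(['k', 'o', ' ', 'N', '!'], {'k', 'o', 'N'}, {'。', '、'}))
# -> ['k', 'o', ' ', 'N']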
157 changes: 0 additions & 157 deletions nemo/collections/tts/g2p/models/ja_jp_ipa.py

This file was deleted.

2 changes: 0 additions & 2 deletions requirements/requirements_tts.txt
@@ -1,7 +1,5 @@
 attrdict
-cutlet
 einops
-janome
 jieba
 kornia
 librosa
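
For context (an inference, not stated in the diff): cutlet and janome are Japanese text-processing packages, so dropping them is consistent with the rest of the revert. A hypothetical sanity check against a local checkout:

# Hypothetical check: the TTS requirements should no longer list the
# Japanese-specific packages after this commit (the diff shows unpinned names).
from pathlib import Path

reqs = Path("requirements/requirements_tts.txt").read_text().splitlines()
assert "cutlet" not in reqs and "janome" not in reqs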