add chinese dialect cleaners

huangyisan · Oct 11, 2022 · 99fd14b · 99fd14b
1 parent 4e0f6cd
commit 99fd14b
Show file tree

Hide file tree

Showing 9 changed files with 228 additions and 48 deletions.
diff --git a/text/cantonese.py b/text/cantonese.py
@@ -0,0 +1,59 @@
+import re
+import cn2an
+import opencc
+
+
+# OpenCC converter built from the 'jyutjyu' configuration; used by
+# cantonese_to_ipa() in this module.
+converter = opencc.OpenCC('jyutjyu')
+
+# List of (Latin alphabet, ipa) pairs:
+# Each uppercase letter is compiled to a regex and paired with the IPA string
+# (tone letters included) that latin_to_ipa() substitutes for it.
+_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('A', 'ei˥'),
+    ('B', 'biː˥'),
+    ('C', 'siː˥'),
+    ('D', 'tiː˥'),
+    ('E', 'iː˥'),
+    ('F', 'e˥fuː˨˩'),
+    ('G', 'tsiː˥'),
+    ('H', 'ɪk̚˥tsʰyː˨˩'),
+    ('I', 'ɐi˥'),
+    ('J', 'tsei˥'),
+    ('K', 'kʰei˥'),
+    ('L', 'e˥llou˨˩'),
+    ('M', 'ɛːm˥'),
+    ('N', 'ɛːn˥'),
+    ('O', 'ou˥'),
+    ('P', 'pʰiː˥'),
+    ('Q', 'kʰiːu˥'),
+    ('R', 'aː˥lou˨˩'),
+    ('S', 'ɛː˥siː˨˩'),
+    ('T', 'tʰiː˥'),
+    ('U', 'juː˥'),
+    ('V', 'wiː˥'),
+    ('W', 'tʊk̚˥piː˥juː˥'),
+    ('X', 'ɪk̚˥siː˨˩'),
+    ('Y', 'waːi˥'),
+    ('Z', 'iː˨sɛːt̚˥')
+]]
+
+
+def number_to_cantonese(text):
+    """Replace each numeric run in `text` with what cn2an.an2cn returns for it.
+
+    A numeric run is one or more digits, optionally followed by further
+    digits with an optional '.' in between.
+    """
+    def _spell_out(match):
+        return cn2an.an2cn(match.group())
+
+    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
+
+
+def latin_to_ipa(text):
+    """Apply every (pattern, ipa) pair of _latin_to_ipa to `text`, in table order."""
+    converted = text
+    for pattern, ipa in _latin_to_ipa:
+        converted = pattern.sub(ipa, converted)
+    return converted
+
+
+def cantonese_to_ipa(text):
+    """Run Cantonese `text` through this module's conversion pipeline.
+
+    Steps, in order: upper-case the input and rewrite numbers with
+    number_to_cantonese(); pass the result through the OpenCC `converter`,
+    dropping '-' and turning '$' into a space; replace each remaining A-Z
+    letter with latin_to_ipa() output followed by a space; rewrite full-width
+    punctuation as ASCII punctuation plus a space; strip trailing whitespace.
+    """
+    text = converter.convert(number_to_cantonese(text.upper()))
+    text = text.replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda m: latin_to_ipa(m.group()) + ' ', text)
+    # Fold 、；： into ， first so they share the comma's ASCII rendering below.
+    text = re.sub(r'[、；：]', '，', text)
+    for mark, ascii_form in (('，', ', '), ('。', '. '), ('？', '? '), ('！', '! ')):
+        # Surrounding whitespace is absorbed into the replacement.
+        text = re.sub(r'\s*' + mark + r'\s*', ascii_form, text)
+    return re.sub(r'\s*$', '', text)
diff --git a/text/cleaners.py b/text/cleaners.py
@@ -1,11 +1,13 @@
 import re
-from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2
+from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
 from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
-from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa
+from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
 from text.sanskrit import devanagari_to_ipa
-from text.english import english_to_lazy_ipa, english_to_ipa2
+from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
 from text.thai import num_to_thai, latin_to_thai
 from text.shanghainese import shanghainese_to_ipa
+from text.cantonese import cantonese_to_ipa
+from text.ngu_dialect import ngu_dialect_to_ipa
 
 
 def japanese_cleaners(text):
@@ -152,3 +154,23 @@ def shanghainese_cleaners(text):
     if re.match(r'[^\.,!\?\-…~]', text[-1]):
         text += '.'
     return text
+
+
+def chinese_dialect_cleaners(text):
+    """Convert every language-tagged span of `text` with its converter.
+
+    Spans are written as [TAG]content[TAG]. The tags MD, TW, JA, SH, GD and EN
+    are handled first, in that order, by chinese_to_ipa2 (TW passes True as
+    the second argument), japanese_to_ipa3, shanghainese_to_ipa,
+    cantonese_to_ipa and english_to_lazy_ipa2. Any remaining span tagged with
+    two uppercase letters is handed to ngu_dialect_to_ipa together with its
+    tag. Each converted span is followed by one space. Finally trailing
+    whitespace is removed and a '.' is appended unless the text already ends
+    with one of . , ! ? - … ~
+    """
+    def _shanghainese(span):
+        ipa = shanghainese_to_ipa(span)
+        # Rewrite the tone digits and small-capital vowels in the result.
+        for old, new in (('1', '˥˧'), ('5', '˧˧˦'), ('6', '˩˩˧'), ('7', '˥'),
+                         ('8', '˩˨'), ('ᴀ', 'ɐ'), ('ᴇ', 'e')):
+            ipa = ipa.replace(old, new)
+        return ipa
+
+    def _ngu(match):
+        ipa = ngu_dialect_to_ipa(match.group(2), match.group(1))
+        # Spell the affricate ligatures out as two characters.
+        for old, new in (('ʣ', 'dz'), ('ʥ', 'dʑ'), ('ʦ', 'ts'), ('ʨ', 'tɕ')):
+            ipa = ipa.replace(old, new)
+        return ipa + ' '
+
+    tagged_converters = (
+        ('MD', lambda span: chinese_to_ipa2(span)),
+        ('TW', lambda span: chinese_to_ipa2(span, True)),
+        ('JA', lambda span: japanese_to_ipa3(span).replace('Q', 'ʔ')),
+        ('SH', _shanghainese),
+        ('GD', lambda span: cantonese_to_ipa(span)),
+        ('EN', lambda span: english_to_lazy_ipa2(span)),
+    )
+    for tag, to_ipa in tagged_converters:
+        text = re.sub(r'\[%s\](.*?)\[%s\]' % (tag, tag),
+                      lambda m, to_ipa=to_ipa: to_ipa(m.group(1)) + ' ', text)
+
+    # Runs last: the explicitly handled tags above have already been consumed.
+    text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', _ngu, text)
+    text = re.sub(r'\s+$', '', text)
+    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
diff --git a/text/japanese.py b/text/japanese.py
@@ -51,15 +51,15 @@
 # List of (consonant, sokuon) pairs:
 _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
     (r'Q([↑↓]*[kg])', r'k#\1'),
-    (r'Q([↑↓]*[tdjʦʧʥ])', r't#\1'),
+    (r'Q([↑↓]*[tdjʧ])', r't#\1'),
     (r'Q([↑↓]*[sʃ])', r's\1'),
     (r'Q([↑↓]*[pb])', r'p#\1')
 ]]
 
 # List of (consonant, hatsuon) pairs:
 _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
     (r'N([↑↓]*[pbm])', r'm\1'),
-    (r'N([↑↓]*(?:[ʧʥj]|tʃ))', r'n^\1'),
+    (r'N([↑↓]*[ʧʥj])', r'n^\1'),
     (r'N([↑↓]*[tdn])', r'n\1'),
     (r'N([↑↓]*[kg])', r'ŋ\1')
 ]]
@@ -127,7 +127,7 @@ def get_real_hatsuon(text):
 def japanese_to_ipa(text):
     text = japanese_to_romaji_with_accent(text).replace('...', '…')
     text = re.sub(
-        r'([A-Za-zɯ])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
+        r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
     text = get_real_sokuon(text)
     text = get_real_hatsuon(text)
     for regex, replacement in _romaji_to_ipa:
@@ -142,4 +142,12 @@ def japanese_to_ipa2(text):
     for regex, replacement in _romaji_to_ipa2:
         text = re.sub(regex, replacement, text)
     return text
-print(japanese_to_ipa('そうでした。'))
+
+
+def japanese_to_ipa3(text):
+    """Post-process the output of japanese_to_ipa2() for `text`.
+
+    Rewrites 'n^' as 'ȵ', 'ʃ' as 'ɕ', '*' as U+0325 and '#' as U+031A, then
+    collapses repeated vowels into vowel + 'ː', then appends 'ʰ' to ts, tɕ,
+    k, p or t when they stand at the start of the text or after whitespace.
+    """
+    text = japanese_to_ipa2(text)
+    for old, new in (('n^', 'ȵ'), ('ʃ', 'ɕ'), ('*', '\u0325'), ('#', '\u031a')):
+        text = text.replace(old, new)
+    # A run of one repeated vowel becomes that vowel plus one 'ː' per repeat.
+    text = re.sub(r'([aiɯeo])\1+',
+                  lambda m: m.group(1) + 'ː' * (len(m.group(0)) - 1), text)
+    return re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
diff --git a/text/lexicons/zaonhe.json b/text/lexicons/zaonhe.json
diff --git a/text/lexicons/zaonhe.ocd2 b/text/lexicons/zaonhe.ocd2
diff --git a/text/mandarin.py b/text/mandarin.py
@@ -4,11 +4,6 @@
 from pypinyin import lazy_pinyin, BOPOMOFO
 import jieba
 import cn2an
-import logging
-
-logging.getLogger('jieba').setLevel(logging.WARNING)
-jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/jieba/dict.txt')
-jieba.initialize()
 
 
 # List of (Latin alphabet, bopomofo) pairs:
@@ -174,6 +169,68 @@
     ('—', '-')
 ]]
 
+# List of (bopomofo, ipa2) pairs:
+# bopomofo_to_ipa2() applies the pairs one after another in this order, so
+# multi-symbol entries (e.g. 'ㄅㄛ', 'ㄧㄢ') are listed before the single
+# symbols they contain. The tail of the list maps the tone marks ˉ ˊ ˇ ˋ to
+# tone letters, removes ˙, and rewrites full-width punctuation as ASCII.
+_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ㄅㄛ', 'pwo'),
+    ('ㄆㄛ', 'pʰwo'),
+    ('ㄇㄛ', 'mwo'),
+    ('ㄈㄛ', 'fwo'),
+    ('ㄅ', 'p'),
+    ('ㄆ', 'pʰ'),
+    ('ㄇ', 'm'),
+    ('ㄈ', 'f'),
+    ('ㄉ', 't'),
+    ('ㄊ', 'tʰ'),
+    ('ㄋ', 'n'),
+    ('ㄌ', 'l'),
+    ('ㄍ', 'k'),
+    ('ㄎ', 'kʰ'),
+    ('ㄏ', 'h'),
+    ('ㄐ', 'tɕ'),
+    ('ㄑ', 'tɕʰ'),
+    ('ㄒ', 'ɕ'),
+    ('ㄓ', 'tʂ'),
+    ('ㄔ', 'tʂʰ'),
+    ('ㄕ', 'ʂ'),
+    ('ㄖ', 'ɻ'),
+    ('ㄗ', 'ts'),
+    ('ㄘ', 'tsʰ'),
+    ('ㄙ', 's'),
+    ('ㄚ', 'a'),
+    ('ㄛ', 'o'),
+    ('ㄜ', 'ɤ'),
+    ('ㄝ', 'ɛ'),
+    ('ㄞ', 'aɪ'),
+    ('ㄟ', 'eɪ'),
+    ('ㄠ', 'ɑʊ'),
+    ('ㄡ', 'oʊ'),
+    ('ㄧㄢ', 'jɛn'),
+    ('ㄩㄢ', 'yæn'),
+    ('ㄢ', 'an'),
+    ('ㄧㄣ', 'in'),
+    ('ㄩㄣ', 'yn'),
+    ('ㄣ', 'ən'),
+    ('ㄤ', 'ɑŋ'),
+    ('ㄧㄥ', 'iŋ'),
+    ('ㄨㄥ', 'ʊŋ'),
+    ('ㄩㄥ', 'jʊŋ'),
+    ('ㄥ', 'ɤŋ'),
+    ('ㄦ', 'əɻ'),
+    ('ㄧ', 'i'),
+    ('ㄨ', 'u'),
+    ('ㄩ', 'y'),
+    ('ˉ', '˥'),
+    ('ˊ', '˧˥'),
+    ('ˇ', '˨˩˦'),
+    ('ˋ', '˥˩'),
+    ('˙', ''),
+    ('，', ','),
+    ('。', '.'),
+    ('！', '!'),
+    ('？', '?'),
+    ('—', '-')
+]]
+
 
 def number_to_chinese(text):
     numbers = re.findall(r'\d+(?:\.?\d+)?', text)
@@ -182,7 +239,7 @@ def number_to_chinese(text):
     return text
 
 
-def chinese_to_bopomofo(text):
+def chinese_to_bopomofo(text, taiwanese=False):
     text = text.replace('、', '，').replace('；', '，').replace('：', '，')
     words = jieba.lcut(text, cut_all=False)
     text = ''
@@ -192,11 +249,13 @@ def chinese_to_bopomofo(text):
             text += word
             continue
         for i in range(len(bopomofos)):
-            if re.match('[\u3105-\u3129]', bopomofos[i][-1]):
-                bopomofos[i] += 'ˉ'
+            bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
         if text != '':
             text += ' '
-        text += ''.join(bopomofos)
+        if taiwanese:
+            text += '#'+'#'.join(bopomofos)
+        else:
+            text += ''.join(bopomofos)
     return text
 
 
@@ -218,17 +277,22 @@ def bopomofo_to_ipa(text):
     return text
 
 
+def bopomofo_to_ipa2(text):
+    """Apply every (pattern, ipa) pair of _bopomofo_to_ipa2 to `text`, in table order."""
+    result = text
+    for pattern, ipa in _bopomofo_to_ipa2:
+        result = pattern.sub(ipa, result)
+    return result
+
+
 def chinese_to_romaji(text):
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
     text = bopomofo_to_romaji(text)
-    text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
-    text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
-    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
-                  'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
-    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
-                  lambda x: x.group(1)+'ɹ'+x.group(2), text)
+    text = re.sub('i([aoe])', r'y\1', text)
+    text = re.sub('u([aoəe])', r'w\1', text)
+    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
+                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
+    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
     return text
 
 
@@ -244,10 +308,21 @@ def chinese_to_ipa(text):
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
     text = bopomofo_to_ipa(text)
-    text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
-    text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
-    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
-                  'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
-    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
-                  lambda x: x.group(1)+'ɹ'+x.group(2), text)
+    text = re.sub('i([aoe])', r'j\1', text)
+    text = re.sub('u([aoəe])', r'w\1', text)
+    text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
+                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
+    text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
+    return text
+
+
+def chinese_to_ipa2(text, taiwanese=False):
+    """Convert Chinese `text` through the bopomofo -> ipa2 pipeline.
+
+    Numbers are rewritten by number_to_chinese(), the text is converted by
+    chinese_to_bopomofo() (which receives `taiwanese`), Latin letters by
+    latin_to_bopomofo(), and the result by bopomofo_to_ipa2(). Afterwards
+    'i'/'u' before a vowel become 'j'/'w', and 'ʅ' or 'ɿ' is inserted after a
+    sibilant (optionally aspirated) that is directly followed by tone
+    letters, a space or the end of the text.
+
+    NOTE(review): the _bopomofo_to_ipa2 table maps ㄖ to 'ɻ', so the 'ɹ'
+    alternative in the 'ʅ' rule looks unreachable - confirm against
+    latin_to_bopomofo() output before relying on it.
+    """
+    text = chinese_to_bopomofo(number_to_chinese(text), taiwanese)
+    text = bopomofo_to_ipa2(latin_to_bopomofo(text))
+    rewrites = (
+        (r'i([aoe])', r'j\1'),
+        (r'u([aoəe])', r'w\1'),
+        (r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2'),
+        (r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2'),
+    )
+    for pattern, replacement in rewrites:
+        text = re.sub(pattern, replacement, text)
     return text
diff --git a/text/ngu_dialect.py b/text/ngu_dialect.py
@@ -0,0 +1,29 @@
+import re
+import opencc
+
+
+# Maps the two-letter tag passed to ngu_dialect_to_ipa() to the name handed to
+# opencc.OpenCC() for that dialect.
+# NOTE(review): the key 'XS' appears twice below ('xiashi' and 'xiaoshan').
+# In a dict literal the later entry wins, so 'xiashi' is silently dropped and
+# no tag reaches it - one of the two needs a distinct code.
+dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
+            'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
+            'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
+            'JS': 'jiashan', 'XS': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
+            'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 'TT': 'tiantai'}
+
+# One OpenCC converter per dialect, keyed by the dialect's configuration name.
+converters = {}
+
+for dialect in dialects.values():
+    try:
+        converters[dialect] = opencc.OpenCC(dialect)
+    except Exception:
+        # Best effort: a dialect whose configuration cannot be loaded is left
+        # out of `converters`. Only Exception is caught so KeyboardInterrupt
+        # and SystemExit raised during import still propagate.
+        pass
+
+
+def ngu_dialect_to_ipa(text, dialect):
+    """Run `text` through the OpenCC converter of `dialect` and tidy punctuation.
+
+    `dialect` is a key of `dialects` (for example 'SZ'). In the converter
+    output '$' becomes a space; full-width punctuation is rewritten as ASCII
+    punctuation plus a space (、；： are treated like ，); trailing whitespace
+    is stripped.
+
+    Raises:
+        KeyError: if `dialect` is not a key of `dialects`, or if no converter
+            was loaded for it.
+    """
+    if dialect not in dialects:
+        raise KeyError('unknown dialect tag: %r' % (dialect,))
+    name = dialects[dialect]
+    if name not in converters:
+        # The module-level loader skips dialects whose OpenCC config fails to
+        # load, so name the real cause instead of surfacing a bare key.
+        raise KeyError('no OpenCC converter loaded for dialect %r' % (name,))
+    text = converters[name].convert(text).replace('$', ' ')
+    text = re.sub(r'[、；：]', '，', text)
+    text = re.sub(r'\s*，\s*', ', ', text)
+    text = re.sub(r'\s*。\s*', '. ', text)
+    text = re.sub(r'\s*？\s*', '? ', text)
+    text = re.sub(r'\s*！\s*', '! ', text)
+    text = re.sub(r'\s*$', '', text)
+    return text
diff --git a/text/shanghainese.py b/text/shanghainese.py
@@ -3,7 +3,7 @@
 import opencc
 
 
-converter = opencc.OpenCC(os.path.dirname(sys.argv[0])+'/lexicons/zaonhe.json')
+converter = opencc.OpenCC('zaonhe')
 
 # List of (Latin alphabet, ipa) pairs:
 _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [

diff --git a/text/symbols.py b/text/symbols.py
@@ -62,6 +62,12 @@
 _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
 '''
 
+'''# chinese_dialect_cleaners
+_pad        = '_'
+_punctuation = ',.!?~…─'
+_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚αᴀᴇ↑↓∅ⱼ '
+'''
+
 # Export all symbols:
 symbols = [_pad] + list(_punctuation) + list(_letters)