Skip to content

Commit

Permalink
add chinese dialect cleaners
Browse files Browse the repository at this point in the history
  • Loading branch information
CjangCjengh committed Oct 11, 2022
1 parent 4e0f6cd commit 99fd14b
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 48 deletions.
59 changes: 59 additions & 0 deletions text/cantonese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import re
import cn2an
import opencc


# Module-level OpenCC converter built from the 'jyutjyu' configuration;
# cantonese_to_ipa() runs its input through converter.convert().
# NOTE(review): 'jyutjyu' is presumably a custom (non-stock) OpenCC config
# that has to be installed separately - confirm before deploying.
converter = opencc.OpenCC('jyutjyu')

# Letter-name table: one (compiled single-letter regex, IPA replacement)
# pair per capital Latin letter, in A-Z order. latin_to_ipa() applies the
# pairs sequentially.
_latin_to_ipa = [
    (re.compile(letter), ipa)
    for letter, ipa in (
        ('A', 'ei˥'),
        ('B', 'biː˥'),
        ('C', 'siː˥'),
        ('D', 'tiː˥'),
        ('E', 'iː˥'),
        ('F', 'e˥fuː˨˩'),
        ('G', 'tsiː˥'),
        ('H', 'ɪk̚˥tsʰyː˨˩'),
        ('I', 'ɐi˥'),
        ('J', 'tsei˥'),
        ('K', 'kʰei˥'),
        ('L', 'e˥llou˨˩'),
        ('M', 'ɛːm˥'),
        ('N', 'ɛːn˥'),
        ('O', 'ou˥'),
        ('P', 'pʰiː˥'),
        ('Q', 'kʰiːu˥'),
        ('R', 'aː˥lou˨˩'),
        ('S', 'ɛː˥siː˨˩'),
        ('T', 'tʰiː˥'),
        ('U', 'juː˥'),
        ('V', 'wiː˥'),
        ('W', 'tʊk̚˥piː˥juː˥'),
        ('X', 'ɪk̚˥siː˨˩'),
        ('Y', 'waːi˥'),
        ('Z', 'iː˨sɛːt̚˥'),
    )
]


def number_to_cantonese(text):
    """Spell out every Arabic numeral in *text* as Chinese numerals.

    Each run of digits (optionally with one decimal point between digits)
    is handed to ``cn2an.an2cn`` and replaced by its result; everything
    else is returned unchanged.
    """
    def _spell_out(match):
        return cn2an.an2cn(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)


def latin_to_ipa(text):
    """Replace capital Latin letters in *text* using the ``_latin_to_ipa`` table.

    The (pattern, replacement) pairs are applied one after another, so
    every occurrence of each listed letter is substituted by its IPA string.
    """
    for letter_pattern, ipa in _latin_to_ipa:
        text = letter_pattern.sub(ipa, text)
    return text


def cantonese_to_ipa(text):
    """Convert Cantonese text to an IPA string with ASCII punctuation.

    Processing order:
      1. upper-case the input and spell out numerals (number_to_cantonese);
      2. run the module-level OpenCC ``converter``, then drop '-' and turn
         '$' into a space;
      3. replace every remaining capital Latin letter via latin_to_ipa,
         followed by a space;
      4. normalise punctuation to ', ', '. ', '? ' and '! ' and strip
         trailing whitespace.

    Args:
        text: input string.

    Returns:
        The converted string.
    """
    text = number_to_cantonese(text.upper())
    text = converter.convert(text).replace('-', '').replace('$', ' ')
    text = re.sub(r'[A-Z]', lambda m: latin_to_ipa(m.group()) + ' ', text)

    # Punctuation is matched in both its ASCII and full-width form. The
    # full-width characters are written as escapes so they cannot be silently
    # folded to ASCII again:
    #   \u3001 ideographic comma, \uff1b/\uff1a full-width ; and :,
    #   \uff0c full-width comma, \u3002 ideographic full stop,
    #   \uff1f full-width ?, \uff01 full-width !
    text = re.sub(r'[\u3001;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*\u3002\s*', '. ', text)
    # The question mark must sit inside a character class: a bare '?' after
    # '\s*' is parsed as a lazy quantifier, which matches the empty string
    # at every position and sprays '? ' between all characters.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    return re.sub(r'\s*$', '', text)
28 changes: 25 additions & 3 deletions text/cleaners.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import re
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
from text.sanskrit import devanagari_to_ipa
from text.english import english_to_lazy_ipa, english_to_ipa2
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from text.thai import num_to_thai, latin_to_thai
from text.shanghainese import shanghainese_to_ipa
from text.cantonese import cantonese_to_ipa
from text.ngu_dialect import ngu_dialect_to_ipa


def japanese_cleaners(text):
Expand Down Expand Up @@ -152,3 +154,23 @@ def shanghainese_cleaners(text):
if re.match(r'[^\.,!\?\-…~]', text[-1]):
text += '.'
return text


def chinese_dialect_cleaners(text):
    """Convert language-tagged segments of *text* to IPA.

    Segments are written as ``[TAG]...[TAG]``. The fixed tags are handled
    first, in this order: MD and TW (chinese_to_ipa2, TW with the second
    argument True), JA (japanese_to_ipa3, 'Q' -> 'ʔ'), SH
    (shanghainese_to_ipa, tone digits and small capitals rewritten), GD
    (cantonese_to_ipa) and EN (english_to_lazy_ipa2). Any remaining
    two-capital-letter tag is passed to ngu_dialect_to_ipa together with the
    tag itself. Every converted segment is followed by a space.

    Finally trailing whitespace is removed and a '.' is appended unless the
    text already ends in one of ``. , ! ? - … ~``.
    """
    # Single-character rewrites applied to the Shanghainese converter output.
    sh_fixups = str.maketrans({
        '1': '˥˧', '5': '˧˧˦', '6': '˩˩˧', '7': '˥', '8': '˩˨',
        'ᴀ': 'ɐ', 'ᴇ': 'e',
    })
    # Affricate ligatures expanded in the generic dialect converter output.
    ligature_fixups = str.maketrans({
        'ʣ': 'dz', 'ʥ': 'dʑ', 'ʦ': 'ts', 'ʨ': 'tɕ',
    })

    # (pattern, converter) pairs; order matters and the converters are
    # wrapped in lambdas so they are only looked up when a tag matches.
    tagged_converters = (
        (r'\[MD\](.*?)\[MD\]', lambda seg: chinese_to_ipa2(seg)),
        (r'\[TW\](.*?)\[TW\]', lambda seg: chinese_to_ipa2(seg, True)),
        (r'\[JA\](.*?)\[JA\]',
         lambda seg: japanese_to_ipa3(seg).replace('Q', 'ʔ')),
        (r'\[SH\](.*?)\[SH\]',
         lambda seg: shanghainese_to_ipa(seg).translate(sh_fixups)),
        (r'\[GD\](.*?)\[GD\]', lambda seg: cantonese_to_ipa(seg)),
        (r'\[EN\](.*?)\[EN\]', lambda seg: english_to_lazy_ipa2(seg)),
    )
    for pattern, convert in tagged_converters:
        text = re.sub(
            pattern,
            lambda m, convert=convert: convert(m.group(1)) + ' ',
            text)

    def _generic_dialect(m):
        ipa = ngu_dialect_to_ipa(m.group(2), m.group(1))
        return ipa.translate(ligature_fixups) + ' '

    text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', _generic_dialect, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
16 changes: 12 additions & 4 deletions text/japanese.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@
# List of (consonant, sokuon) pairs:
_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'Q([↑↓]*[kg])', r'k#\1'),
(r'Q([↑↓]*[tdjʦʧʥ])', r't#\1'),
(r'Q([↑↓]*[tdjʧ])', r't#\1'),
(r'Q([↑↓]*[sʃ])', r's\1'),
(r'Q([↑↓]*[pb])', r'p#\1')
]]

# List of (consonant, hatsuon) pairs:
_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'N([↑↓]*[pbm])', r'm\1'),
(r'N([↑↓]*(?:[ʧʥj]|tʃ))', r'n^\1'),
(r'N([↑↓]*[ʧʥj])', r'n^\1'),
(r'N([↑↓]*[tdn])', r'n\1'),
(r'N([↑↓]*[kg])', r'ŋ\1')
]]
Expand Down Expand Up @@ -127,7 +127,7 @@ def get_real_hatsuon(text):
def japanese_to_ipa(text):
text = japanese_to_romaji_with_accent(text).replace('...', '…')
text = re.sub(
r'([A-Za-zɯ])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
text = get_real_sokuon(text)
text = get_real_hatsuon(text)
for regex, replacement in _romaji_to_ipa:
Expand All @@ -142,4 +142,12 @@ def japanese_to_ipa2(text):
for regex, replacement in _romaji_to_ipa2:
text = re.sub(regex, replacement, text)
return text
print(japanese_to_ipa('そうでした。'))


def japanese_to_ipa3(text):
    """Post-process the output of japanese_to_ipa2 into a third IPA variant.

    On top of japanese_to_ipa2(text) this:
      * rewrites 'n^' -> 'ȵ', 'ʃ' -> 'ɕ', '*' -> U+0325 and '#' -> U+031A;
      * collapses a repeated vowel (a, i, ɯ, e, o) into the vowel followed
        by one 'ː' per extra repetition;
      * appends 'ʰ' to 'ts', 'tɕ', 'k', 'p' or 't' when it occurs at the
        start of the string or right after whitespace.
    """
    ipa = japanese_to_ipa2(text)
    for old, new in (('n^', 'ȵ'), ('ʃ', 'ɕ'),
                     ('*', '\u0325'), ('#', '\u031a')):
        ipa = ipa.replace(old, new)

    def _lengthen(match):
        run = match.group(0)
        return run[0] + 'ː' * (len(run) - 1)

    ipa = re.sub(r'([aiɯeo])\1+', _lengthen, ipa)
    return re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', ipa)
19 changes: 0 additions & 19 deletions text/lexicons/zaonhe.json

This file was deleted.

Binary file removed text/lexicons/zaonhe.ocd2
Binary file not shown.
117 changes: 96 additions & 21 deletions text/mandarin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
from pypinyin import lazy_pinyin, BOPOMOFO
import jieba
import cn2an
import logging

logging.getLogger('jieba').setLevel(logging.WARNING)
jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/jieba/dict.txt')
jieba.initialize()


# List of (Latin alphabet, bopomofo) pairs:
Expand Down Expand Up @@ -174,6 +169,68 @@
('—', '-')
]]

# List of (bopomofo, ipa2) pairs, applied in order by bopomofo_to_ipa2().
# Each source is compiled as a regex, so it must not contain unescaped
# metacharacters. Multi-symbol entries come before the single symbols they
# contain (e.g. 'ㄅㄛ' before 'ㄅ', 'ㄧㄢ' before 'ㄢ' and 'ㄧ') so that the
# longer match wins.
_bopomofo_to_ipa2 = [(re.compile(src), ipa) for src, ipa in [
    ('ㄅㄛ', 'pwo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'tɕ'),
    ('ㄑ', 'tɕʰ'),
    ('ㄒ', 'ɕ'),
    ('ㄓ', 'tʂ'),
    ('ㄔ', 'tʂʰ'),
    ('ㄕ', 'ʂ'),
    ('ㄖ', 'ɻ'),
    ('ㄗ', 'ts'),
    ('ㄘ', 'tsʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ɤ'),
    ('ㄝ', 'ɛ'),
    ('ㄞ', 'aɪ'),
    ('ㄟ', 'eɪ'),
    ('ㄠ', 'ɑʊ'),
    ('ㄡ', 'oʊ'),
    ('ㄧㄢ', 'jɛn'),
    ('ㄩㄢ', 'yæn'),
    ('ㄢ', 'an'),
    ('ㄧㄣ', 'in'),
    ('ㄩㄣ', 'yn'),
    ('ㄣ', 'ən'),
    ('ㄤ', 'ɑŋ'),
    ('ㄧㄥ', 'iŋ'),
    ('ㄨㄥ', 'ʊŋ'),
    ('ㄩㄥ', 'jʊŋ'),
    ('ㄥ', 'ɤŋ'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'y'),
    ('ˉ', '˥'),
    ('ˊ', '˧˥'),
    ('ˇ', '˨˩˦'),
    ('ˋ', '˥˩'),
    ('˙', ''),
    # Full-width CJK punctuation -> ASCII. The sources are spelled as
    # escapes on purpose: an ASCII '?' here is an invalid regex ("nothing
    # to repeat") and would make the module fail at import time.
    ('\uff0c', ','),   # full-width comma
    ('\u3002', '.'),   # ideographic full stop
    ('\uff01', '!'),   # full-width exclamation mark
    ('\uff1f', '?'),   # full-width question mark
    ('\u2014', '-')    # em dash
]]


def number_to_chinese(text):
numbers = re.findall(r'\d+(?:\.?\d+)?', text)
Expand All @@ -182,7 +239,7 @@ def number_to_chinese(text):
return text


def chinese_to_bopomofo(text):
def chinese_to_bopomofo(text, taiwanese=False):
text = text.replace('、', ',').replace(';', ',').replace(':', ',')
words = jieba.lcut(text, cut_all=False)
text = ''
Expand All @@ -192,11 +249,13 @@ def chinese_to_bopomofo(text):
text += word
continue
for i in range(len(bopomofos)):
if re.match('[\u3105-\u3129]', bopomofos[i][-1]):
bopomofos[i] += 'ˉ'
bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
if text != '':
text += ' '
text += ''.join(bopomofos)
if taiwanese:
text += '#'+'#'.join(bopomofos)
else:
text += ''.join(bopomofos)
return text


Expand All @@ -218,17 +277,22 @@ def bopomofo_to_ipa(text):
return text


def bopomofo_to_ipa2(text):
    """Rewrite bopomofo symbols in *text* using the ``_bopomofo_to_ipa2`` table.

    The (pattern, replacement) pairs are applied sequentially in table
    order; the resulting string is returned.
    """
    for symbol_pattern, ipa in _bopomofo_to_ipa2:
        text = symbol_pattern.sub(ipa, text)
    return text


def chinese_to_romaji(text):
text = number_to_chinese(text)
text = chinese_to_bopomofo(text)
text = latin_to_bopomofo(text)
text = bopomofo_to_romaji(text)
text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
lambda x: x.group(1)+'ɹ'+x.group(2), text)
text = re.sub('i([aoe])', r'y\1', text)
text = re.sub('u([aoəe])', r'w\1', text)
text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
return text


Expand All @@ -244,10 +308,21 @@ def chinese_to_ipa(text):
text = chinese_to_bopomofo(text)
text = latin_to_bopomofo(text)
text = bopomofo_to_ipa(text)
text = re.sub('i[aoe]', lambda x: 'y'+x.group(0)[1:], text)
text = re.sub('u[aoəe]', lambda x: 'w'+x.group(0)[1:], text)
text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', lambda x: x.group(1) +
'ɹ`'+x.group(2), text).replace('ɻ', 'ɹ`')
text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
lambda x: x.group(1)+'ɹ'+x.group(2), text)
text = re.sub('i([aoe])', r'j\1', text)
text = re.sub('u([aoəe])', r'w\1', text)
text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
return text


def chinese_to_ipa2(text, taiwanese=False):
    """Convert Chinese text to the "ipa2" transcription.

    Pipeline: number_to_chinese -> chinese_to_bopomofo (which receives the
    *taiwanese* flag) -> latin_to_bopomofo -> bopomofo_to_ipa2, followed by
    four regex rewrites on the IPA string:
      * 'i' before a/o/e becomes 'j';
      * 'u' before a/o/ə/e becomes 'w';
      * 'ʅ' is inserted after ʂ or ɹ (optionally followed by ʰ) when tone
        letters/spaces or the end of the string follow;
      * 'ɿ' is inserted after s (optionally followed by ʰ) in the same
        position.

    Args:
        text: input string.
        taiwanese: forwarded unchanged to chinese_to_bopomofo.

    Returns:
        The transcribed string.
    """
    bopomofo = chinese_to_bopomofo(number_to_chinese(text), taiwanese)
    ipa = bopomofo_to_ipa2(latin_to_bopomofo(bopomofo))

    # Applied strictly in this order.
    rewrites = (
        (r'i([aoe])', r'j\1'),
        (r'u([aoəe])', r'w\1'),
        (r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2'),
        (r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2'),
    )
    for pattern, replacement in rewrites:
        ipa = re.sub(pattern, replacement, ipa)
    return ipa
29 changes: 29 additions & 0 deletions text/ngu_dialect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import re
import opencc


# Two-letter dialect tag -> OpenCC configuration name. The tag is what
# ngu_dialect_to_ipa() receives; the name is what gets passed to
# opencc.OpenCC() when the converters are built.
#
# Every tag must be unique: in a dict literal a repeated key silently keeps
# only the last value. 'xiashi' used to share the tag 'XS' with 'xiaoshan'
# and was therefore unreachable; it now has its own tag 'HN', while 'XS'
# keeps resolving to 'xiaoshan' exactly as before.
# NOTE(review): 'HN' was picked on the assumption that Xiashi belongs to
# Haining - confirm the tag against the data/config that uses it.
dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
            'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
            'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
            'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
            'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
            'TT': 'tiantai'}

# OpenCC converter per dialect configuration name, filled in at import time.
converters = {}

for dialect in dialects.values():
    try:
        converters[dialect] = opencc.OpenCC(dialect)
    except Exception:
        # Best effort: a configuration that cannot be loaded is skipped so
        # the remaining dialects stay usable; ngu_dialect_to_ipa() raises
        # KeyError for a skipped one. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        continue


def ngu_dialect_to_ipa(text, dialect):
    """Convert *text* with the OpenCC converter registered for a dialect tag.

    Args:
        text: input string.
        dialect: a key of ``dialects`` (for example 'SZ').

    Returns:
        The converter output with '$' turned into spaces, punctuation
        normalised to ', ', '. ', '? ' and '! ', and trailing whitespace
        removed.

    Raises:
        KeyError: if *dialect* is not a key of ``dialects``, or if no
            converter is stored in ``converters`` for that dialect.
    """
    dialect = dialects[dialect]
    text = converters[dialect].convert(text).replace('$', ' ')

    # Punctuation is matched in both its ASCII and full-width form. The
    # full-width characters are written as escapes so they cannot be silently
    # folded to ASCII again:
    #   \u3001 ideographic comma, \uff1b/\uff1a full-width ; and :,
    #   \uff0c full-width comma, \u3002 ideographic full stop,
    #   \uff1f full-width ?, \uff01 full-width !
    text = re.sub(r'[\u3001;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*\u3002\s*', '. ', text)
    # The question mark must sit inside a character class: a bare '?' after
    # '\s*' is parsed as a lazy quantifier, which matches the empty string
    # at every position and sprays '? ' between all characters.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    return re.sub(r'\s*$', '', text)
2 changes: 1 addition & 1 deletion text/shanghainese.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import opencc


converter = opencc.OpenCC(os.path.dirname(sys.argv[0])+'/lexicons/zaonhe.json')
converter = opencc.OpenCC('zaonhe')

# List of (Latin alphabet, ipa) pairs:
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
Expand Down
6 changes: 6 additions & 0 deletions text/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@
_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
'''

'''# chinese_dialect_cleaners
_pad = '_'
_punctuation = ',.!?~…─'
_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚αᴀᴇ↑↓∅ⱼ '
'''

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters)

Expand Down

0 comments on commit 99fd14b

Please sign in to comment.