From 7b833a16be6a369efd5ecece74240f4251760559 Mon Sep 17 00:00:00 2001 From: andreihar <95883512+andreihar@users.noreply.github.com> Date: Fri, 26 Apr 2024 01:58:49 -0700 Subject: [PATCH] Redefined sandhi to allow for more control --- taibun/taibun.py | 17 +++++++++-------- tests/test_sandhi.py | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/taibun/taibun.py b/taibun/taibun.py index f1f2056..8960ca4 100644 --- a/taibun/taibun.py +++ b/taibun/taibun.py @@ -6,12 +6,13 @@ """ Description: Converts Chinese characters to Taiwanese Hokkien phonetic transcriptions. Supports both Traditional and Simplified characters. -Invariant: dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning) - system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA` +Invariant: system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA` + dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning) format = `mark` (diacritical), `number` (numeric), `strip` (no tones) - sandhi = True, False delimiter = String that replaces the default delimiter + sandhi = `auto`, `none`, `exc_last`, `incl_last` punctuation = `format` (Latin-style, default), `none` (preserve original) + convert_non_cjk = True, False """ @@ -114,8 +115,8 @@ def __set_default_delimiter(self): # Helper functions to set sandhi according to transliteration system if wasn't explicitly defined by user def __set_default_sandhi(self): - if self.system == 'tongiong': return True - return False + if self.system == 'tongiong': return 'auto' + return 'none' ### Conversion functions @@ -124,10 +125,10 @@ def __set_default_sandhi(self): def __get_number_tones(self, input): words = self.__preprocess_word(input[0]) number_tones = [self.__get_number_tone(w) for w in words if len(w) > 0] - if self.sandhi or self.format == 'number': + if self.sandhi == 'auto' or self.format == 'number': replace_with_zero = False number_tones = [s[:-1] + '0' if replace_with_zero or (replace_with_zero := s[-1] == '0') else s for s in number_tones] - if self.sandhi: + if self.sandhi == 'auto': index = next((i for i, s in enumerate(number_tones) if s.startswith(self.suffix_token)), len(number_tones)) number_tones = self.__tone_sandhi(number_tones[:index], False) + number_tones[index:] if len(number_tones) != index and len(number_tones) > 1 else self.__tone_sandhi(number_tones, input[1]) return number_tones @@ -157,7 +158,7 @@ def __get_number_tone(self, input): elif re.search('̍', input): input += '8' elif input[-1] in finals: input += '4' else: input += '1' - if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi or self.format == 'number'): + if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi == 'auto' or self.format == 'number'): input = input[:-1] + '0' input = "".join(c for c in unicodedata.normalize("NFD", input) if unicodedata.category(c) != "Mn") return input diff --git a/tests/test_sandhi.py b/tests/test_sandhi.py index 5e63358..a2efce3 100644 --- a/tests/test_sandhi.py +++ b/tests/test_sandhi.py @@ -27,7 +27,7 @@ def test_true(): ] for transl, system in test_data: data = [f"{h},{t}" for h, t in zip(hanji_data, transl)] - checker(data, Converter(system=system, punctuation='none', sandhi=True), Converter(system=system, dialect="north", punctuation='none', sandhi=True)) + checker(data, Converter(system=system, punctuation='none', sandhi='auto'), Converter(system=system, dialect="north", punctuation='none', sandhi='auto')) def test_false(): test_data = [ @@ -54,4 +54,4 @@ def test_sentence(): ] for transl, system in test_data: data = [f"{h},{t}" for h, t in zip(hanji_data, transl)] - checker(data, Converter(system=system, sandhi=True), Converter(system=system, dialect="north", sandhi=True)) \ No newline at end of file + checker(data, Converter(system=system, sandhi='auto'), Converter(system=system, dialect="north", sandhi='auto')) \ No newline at end of file