Redefined sandhi to allow for more control

andreihar · Apr 26, 2024 · 7b833a1
1 parent f86e534
commit 7b833a1
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/taibun/taibun.py b/taibun/taibun.py
@@ -6,12 +6,13 @@
 """
 Description: Converts Chinese characters to Taiwanese Hokkien phonetic transcriptions.
              Supports both Traditional and Simplified characters.
-Invariant: dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
-           system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
+Invariant: system = `Tailo` (default), `POJ`, `Zhuyin`, `TLPA`, `Pingyim`, `Tongiong`, `IPA`
+           dialect = `south` (Zhangzhou-leaning, default), `north` (Quanzhou-leaning)
            format = `mark` (diacritical), `number` (numeric), `strip` (no tones)
-           sandhi = True, False
            delimiter = String that replaces the default delimiter
+           sandhi = `auto`, `none`, `exc_last`, `incl_last`
            punctuation = `format` (Latin-style, default), `none` (preserve original)
+           convert_non_cjk = True, False
 """
 
 
@@ -114,8 +115,8 @@ def __set_default_delimiter(self):
 
     # Helper functions to set sandhi according to transliteration system if wasn't explicitly defined by user
     def __set_default_sandhi(self):
-        if self.system == 'tongiong': return True
-        return False
+        if self.system == 'tongiong': return 'auto'
+        return 'none'
 
 
     ### Conversion functions
@@ -124,10 +125,10 @@ def __set_default_sandhi(self):
     def __get_number_tones(self, input):
         words = self.__preprocess_word(input[0])
         number_tones = [self.__get_number_tone(w) for w in words if len(w) > 0]
-        if self.sandhi or self.format == 'number':
+        if self.sandhi == 'auto' or self.format == 'number':
             replace_with_zero = False
             number_tones = [s[:-1] + '0' if replace_with_zero or (replace_with_zero := s[-1] == '0') else s for s in number_tones]
-        if self.sandhi:
+        if self.sandhi == 'auto':
             index = next((i for i, s in enumerate(number_tones) if s.startswith(self.suffix_token)), len(number_tones))
             number_tones = self.__tone_sandhi(number_tones[:index], False) + number_tones[index:] if len(number_tones) != index and len(number_tones) > 1 else self.__tone_sandhi(number_tones, input[1])
         return number_tones
@@ -157,7 +158,7 @@ def __get_number_tone(self, input):
         elif re.search('̍', input): input += '8'
         elif input[-1] in finals: input += '4'
         else: input += '1'
-        if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi or self.format == 'number'):
+        if input.startswith(self.suffix_token) and (input[-2] == 'h' or self.sandhi == 'auto' or self.format == 'number'):
             input = input[:-1] + '0'
         input = "".join(c for c in unicodedata.normalize("NFD", input) if unicodedata.category(c) != "Mn")
         return input

diff --git a/tests/test_sandhi.py b/tests/test_sandhi.py
@@ -27,7 +27,7 @@ def test_true():
     ]
     for transl, system in test_data:
         data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
-        checker(data, Converter(system=system, punctuation='none', sandhi=True), Converter(system=system, dialect="north", punctuation='none', sandhi=True))
+        checker(data, Converter(system=system, punctuation='none', sandhi='auto'), Converter(system=system, dialect="north", punctuation='none', sandhi='auto'))
 
 def test_false():
     test_data = [
@@ -54,4 +54,4 @@ def test_sentence():
     ]
     for transl, system in test_data:
         data = [f"{h},{t}" for h, t in zip(hanji_data, transl)]
-        checker(data, Converter(system=system, sandhi=True), Converter(system=system, dialect="north", sandhi=True))
+        checker(data, Converter(system=system, sandhi='auto'), Converter(system=system, dialect="north", sandhi='auto'))