Remove the P features (#80)

* Remove the past result features
* Update the Japanese model
* Update the Simplified Chinese model
google · Aug 3, 2022 · 89c56db · 89c56db
1 parent b767d79
commit 89c56db
Show file tree

Hide file tree

Showing 7 changed files with 12 additions and 78 deletions.
diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py
@@ -38,9 +38,9 @@ def unicode_block_index(w: str) -> str:
   return '%03d' % (bisect.bisect_right(block_starts, ord(w[0])))
 
 
-def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
-                p2: str, p3: str) -> typing.List[str]:
-  """Generates a feature from characters around (w1-6) and past results (p1-3).
+def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
+                w6: str) -> typing.List[str]:
+  """Generates a feature from characters around (w1-6).
 
   Args:
     w1 (str): The character 3 characters before the break point.
@@ -49,9 +49,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
     w4 (str): The character right after the break point.
     w5 (str): The character 2 characters after the break point.
     w6 (str): The character 3 characters after the break point.
-    p1 (str): The result 3 steps ago.
-    p2 (str): The result 2 steps ago.
-    p3 (str): The last result.
 
   Returns:
     The feature (list[str]).
@@ -64,11 +61,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
   b5 = unicode_block_index(w5)
   b6 = unicode_block_index(w6)
   raw_feature = {
-      'UP1': p1,
-      'UP2': p2,
-      'UP3': p3,
-      'BP1': p1 + p2,
-      'BP2': p2 + p3,
       'UW1': w1,
       'UW2': w2,
       'UW3': w3,
@@ -95,17 +87,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
       'TB2': b2 + b3 + b4,
       'TB3': b3 + b4 + b5,
       'TB4': b4 + b5 + b6,
-      'UQ1': p1 + b1,
-      'UQ2': p2 + b2,
-      'UQ3': p3 + b3,
-      'BQ1': p2 + b2 + b3,
-      'BQ2': p2 + b3 + b4,
-      'BQ3': p3 + b2 + b3,
-      'BQ4': p3 + b3 + b4,
-      'TQ1': p2 + b1 + b2 + b3,
-      'TQ2': p2 + b2 + b3 + b4,
-      'TQ3': p3 + b1 + b2 + b3,
-      'TQ4': p3 + b2 + b3 + b4,
   }
   for key, value in list(raw_feature.items()):
     if INVALID in value:

diff --git a/budoux/models/ja-knbc.json b/budoux/models/ja-knbc.json
diff --git a/budoux/models/zh-hans.json b/budoux/models/zh-hans.json
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -19,7 +19,7 @@
 from html.parser import HTMLParser
 
 from .feature_extractor import get_feature
-from .utils import INVALID, SEP, Result
+from .utils import INVALID, SEP
 
 MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
 PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: break-word;'
@@ -121,27 +121,20 @@ def parse(self, sentence: str) -> typing.List[str]:
     """
     if sentence == '':
       return []
-    p1 = Result.UNKNOWN.value
-    p2 = Result.UNKNOWN.value
-    p3 = Result.UNKNOWN.value
     chunks = [sentence[0]]
     base_score = -sum(self.model.values())
     for i in range(1, len(sentence)):
       feature = get_feature(
           sentence[i - 3] if i > 2 else INVALID,
           sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], sentence[i],
           sentence[i + 1] if i + 1 < len(sentence) else INVALID,
-          sentence[i + 2] if i + 2 < len(sentence) else INVALID, p1, p2, p3)
+          sentence[i + 2] if i + 2 < len(sentence) else INVALID)
       score = base_score + 2 * sum(
           self.model[f] for f in feature if f in self.model)
       if score > 0:
         chunks.append(sentence[i])
       else:
         chunks[-1] += sentence[i]
-      p = Result.POSITIVE.value if score > 0 else Result.NEGATIVE.value
-      p1 = p2
-      p2 = p3
-      p3 = p
     return chunks
 
   def translate_html_string(self, html: str) -> str:

diff --git a/budoux/utils.py b/budoux/utils.py
@@ -13,17 +13,8 @@
 # limitations under the License.
 """Utilities for BudouX."""
 
-from enum import Enum
-
 SEP = '▁'
 """The separator string to specify breakpoints."""
 
 INVALID = '▔'
 """The invalid feature string."""
-
-
-class Result(Enum):
-  """An enum to represent the type of inference result."""
-  UNKNOWN = 'U'
-  POSITIVE = 'B'
-  NEGATIVE = 'O'
diff --git a/scripts/encode_data.py b/scripts/encode_data.py
@@ -31,23 +31,16 @@ def process(line: str, entries_filename: str) -> None:
   chunk_lengths = [len(chunk) for chunk in chunks]
   sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
   sentence = ''.join(chunks)
-  p1 = utils.Result.UNKNOWN.value
-  p2 = utils.Result.UNKNOWN.value
-  p3 = utils.Result.UNKNOWN.value
   lines = []
   for i in range(1, len(sentence) + 1):
     feature = feature_extractor.get_feature(
         sentence[i - 3] if i > 2 else utils.INVALID,
         sentence[i - 2] if i > 1 else utils.INVALID, sentence[i - 1],
         sentence[i] if i < len(sentence) else utils.INVALID,
         sentence[i + 1] if i + 1 < len(sentence) else utils.INVALID,
-        sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID, p1, p2, p3)
+        sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID)
     positive = i in sep_indices
-    p = utils.Result.POSITIVE.value if positive else utils.Result.NEGATIVE.value
     lines.append('\t'.join(['1' if positive else '-1'] + feature) + '\n')
-    p1 = p2
-    p2 = p3
-    p3 = p
   with open(entries_filename, 'a', encoding=sys.getdefaultencoding()) as f:
     f.write(''.join(lines))
 

diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py
@@ -37,8 +37,7 @@ def check(character: str, block: str, msg: str) -> None:
           'Should return INVALID when INVALID is given.')
 
   def test_get_feature(self) -> None:
-    feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f', 'x',
-                                            'y', 'z')
+    feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f')
     self.assertSetEqual(
         set(feature),
         {
@@ -50,11 +49,6 @@ def test_get_feature(self) -> None:
             'UW5:e',
             'UW6:f',
 
-            # Unigram of Previous Results (UP)
-            'UP1:x',
-            'UP2:y',
-            'UP3:z',
-
             # Unigram of Unicode Blocks (UB)
             'UB1:001',
             'UB2:001',
@@ -63,28 +57,15 @@ def test_get_feature(self) -> None:
             'UB5:001',
             'UB6:001',
 
-            # Combination of UW and UP
-            'UQ1:x001',
-            'UQ2:y001',
-            'UQ3:z001',
-
-            # Bigram of Words (BW), Previous Results (BP), Unicode Blocks (BB), and
-            # its combination (BQ)
+            # Bigram of Words (BW) and Unicode Blocks (BB)
             'BW1:bc',
             'BW2:cd',
             'BW3:de',
-            'BP1:xy',
-            'BP2:yz',
             'BB1:001001',
             'BB2:001001',
             'BB3:001001',
-            'BQ1:y001001',
-            'BQ2:y001001',
-            'BQ3:z001001',
-            'BQ4:z001001',
 
-            # Trigram of Words (BW), Previous Results (BP), Unicode Blocks (BB), and
-            # its combination (BQ)
+            # Trigram of Words (TW) and Unicode Blocks (TB)
             'TW1:abc',
             'TW2:bcd',
             'TW3:cde',
@@ -93,10 +74,6 @@ def test_get_feature(self) -> None:
             'TB2:001001001',
             'TB3:001001001',
             'TB4:001001001',
-            'TQ1:y001001001',
-            'TQ2:y001001001',
-            'TQ3:z001001001',
-            'TQ4:z001001001',
         },
         'Features should be extracted.')
 
@@ -107,7 +84,7 @@ def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
       return False
 
     feature = feature_extractor.get_feature('a', 'a', utils.INVALID, 'a', 'a',
-                                            'a', 'a', 'a', 'a')
+                                            'a')
     self.assertFalse(
         find_by_prefix('UW3:', feature),
         'Should omit the Unigram feature when the character is invalid.')