Remove Unicode Block features (#86)

google · Nov 10, 2022 · ab7b522
1 parent d27ce10
commit ab7b522
Show file tree

Hide file tree

Showing 12 changed files with 7 additions and 225 deletions.
diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py
@@ -13,30 +13,10 @@
 # limitations under the License.
 """Methods to encode source sentences to features."""
 
-import bisect
-import json
-import os
 import typing
 
 from .utils import INVALID
 
-with open(os.path.join(os.path.dirname(__file__), 'unicode_blocks.json')) as f:
-  block_starts: typing.List[int] = json.load(f)
-
-
-def unicode_block_index(w: str) -> str:
-  """Returns the index of the Unicode block that the character belongs to.
-
-  Args:
-    w (str): A character.
-
-  Returns:
-    index (str): Unicode block index in three digits.
-  """
-  if not w or w == INVALID:
-    return INVALID
-  return '%03d' % (bisect.bisect_right(block_starts, ord(w[0])))
-
 
 def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
                 w6: str) -> typing.List[str]:
@@ -54,12 +34,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
     The feature (list[str]).
 
   """
-  b1 = unicode_block_index(w1)
-  b2 = unicode_block_index(w2)
-  b3 = unicode_block_index(w3)
-  b4 = unicode_block_index(w4)
-  b5 = unicode_block_index(w5)
-  b6 = unicode_block_index(w6)
   raw_feature = {
       'UW1': w1,
       'UW2': w2,
@@ -74,19 +48,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
       'TW2': w2 + w3 + w4,
       'TW3': w3 + w4 + w5,
       'TW4': w4 + w5 + w6,
-      'UB1': b1,
-      'UB2': b2,
-      'UB3': b3,
-      'UB4': b4,
-      'UB5': b5,
-      'UB6': b6,
-      'BB1': b2 + b3,
-      'BB2': b3 + b4,
-      'BB3': b4 + b5,
-      'TB1': b1 + b2 + b3,
-      'TB2': b2 + b3 + b4,
-      'TB3': b3 + b4 + b5,
-      'TB4': b4 + b5 + b6,
   }
   for key, value in list(raw_feature.items()):
     if INVALID in value:

diff --git a/budoux/models/ja-knbc.json b/budoux/models/ja-knbc.json
diff --git a/budoux/models/zh-hans.json b/budoux/models/zh-hans.json
diff --git a/budoux/unicode_blocks.json b/budoux/unicode_blocks.json
diff --git a/javascript/scripts/copy-data.js b/javascript/scripts/copy-data.js
@@ -21,15 +21,6 @@ const PROJECT_ROOT = path.join(__dirname, '..', '..');
 const DATA_DIR = path.join(PROJECT_ROOT, 'javascript', 'src', 'data');
 fs.mkdirSync(path.join(DATA_DIR, 'models'), {recursive: true});
 
-const copyUnicodeBlocks = () => {
-  const sourcePath = path.join(PROJECT_ROOT, 'budoux', 'unicode_blocks.json');
-  const targetPath = path.join(DATA_DIR, 'unicode_blocks.ts');
-  fs.writeFileSync(
-    targetPath,
-    `export const unicodeBlocks = ${fs.readFileSync(sourcePath)}`
-  );
-};
-
 const copyModels = () => {
   const modelsDirPath = path.join(PROJECT_ROOT, 'budoux', 'models');
   const files = fs.readdirSync(modelsDirPath);
@@ -57,7 +48,6 @@ const copySkipNodes = () => {
 };
 
 const main = () => {
-  copyUnicodeBlocks();
   copyModels();
   copySkipNodes();
 };

diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-import {unicodeBlocks} from './data/unicode_blocks.js';
 import {model as jaKNBCModel} from './data/models/ja-knbc.js';
 import {model as zhHansModel} from './data/models/zh-hans.js';
 import {parseFromString} from './dom.js';
 import {HTMLProcessor} from './html_processor.js';
-import {bisectRight, INVALID, sum} from './utils.js';
+import {INVALID, sum} from './utils.js';
 
 // We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
 // but we define the same here for Node.js environments.
@@ -35,20 +34,6 @@ export class Parser {
     this.model = model;
   }
 
-  /**
-   * Generates a Unicode Block feature from the given character.
-   *
-   * @param w A character input.
-   * @returns A Unicode Block feature.
-   */
-  static getUnicodeBlockFeature(w: string) {
-    if (!w || w === INVALID) return INVALID;
-    const cp = w.codePointAt(0);
-    if (cp === undefined) return INVALID;
-    const bn = bisectRight(unicodeBlocks, cp);
-    return `${bn}`.padStart(3, '0');
-  }
-
   /**
    * Generates a feature from characters around (w1-w6).
    *
@@ -68,12 +53,6 @@ export class Parser {
     w5: string,
     w6: string
   ) {
-    const b1 = Parser.getUnicodeBlockFeature(w1);
-    const b2 = Parser.getUnicodeBlockFeature(w2);
-    const b3 = Parser.getUnicodeBlockFeature(w3);
-    const b4 = Parser.getUnicodeBlockFeature(w4);
-    const b5 = Parser.getUnicodeBlockFeature(w5);
-    const b6 = Parser.getUnicodeBlockFeature(w6);
     const rawFeature = {
       UW1: w1,
       UW2: w2,
@@ -88,19 +67,6 @@ export class Parser {
       TW2: w2 + w3 + w4,
       TW3: w3 + w4 + w5,
       TW4: w4 + w5 + w6,
-      UB1: b1,
-      UB2: b2,
-      UB3: b3,
-      UB4: b4,
-      UB5: b5,
-      UB6: b6,
-      BB1: b2 + b3,
-      BB2: b3 + b4,
-      BB3: b4 + b5,
-      TB1: b1 + b2 + b3,
-      TB2: b2 + b3 + b4,
-      TB3: b3 + b4 + b5,
-      TB4: b4 + b5 + b6,
     };
     return Object.entries(rawFeature)
       .filter(entry => !entry[1].includes(INVALID))

diff --git a/javascript/src/tests/test_html_processor.ts b/javascript/src/tests/test_html_processor.ts
@@ -103,9 +103,7 @@ describe('HTMLProcessor.applyToElement.separator.node', () => {
     expect(document.body.innerHTML).toEqual(
       '<div class="applied">今日は' +
         '<span style="white-space: normal;">\u200B</span>' +
-        '良い' +
-        '<span style="white-space: normal;">\u200B</span>' +
-        '天気</div>'
+        '良い天気</div>'
     );
   });
 });

diff --git a/javascript/src/tests/test_parser.ts b/javascript/src/tests/test_parser.ts
@@ -19,43 +19,13 @@ import {JSDOM} from 'jsdom';
 import {Parser} from '../parser.js';
 import {INVALID} from '../utils.js';
 
-describe('Parser.getUnicodeBlockFeature', () => {
-  const testFeature = (character: string, feature: string) => {
-    const result = Parser.getUnicodeBlockFeature(character);
-    expect(result).toBe(feature);
-  };
-  it('"a" should be the 1st block "Basic Latin".', () => {
-    testFeature('a', '001');
-  });
-  it('"あ" should be the 108th block "Hiragana".', () => {
-    testFeature('あ', '108');
-  });
-  it('"安" should be the 120th block "Kanji"', () => {
-    testFeature('安', '120');
-  });
-  it('Only the first character should be recoghnized', () => {
-    testFeature('あ安', '108');
-  });
-  it('Should return INVALID when a blank string is given.', () => {
-    testFeature('', INVALID);
-  });
-  it('Should return INVALID when INVALID is given.', () => {
-    testFeature(INVALID, INVALID);
-  });
-});
-
 describe('Parser.getFeature', () => {
   const feature = Parser.getFeature('a', 'b', 'c', 'd', 'e', 'f');
 
   it('should include certain features.', () => {
     expect(feature).toContain('UW1:a');
-    expect(feature).toContain('UB1:001');
-
     expect(feature).toContain('BW1:bc');
-    expect(feature).toContain('BB1:001001');
-
     expect(feature).toContain('TW1:abc');
-    expect(feature).toContain('TB1:001001001');
   });
 });
 
@@ -69,9 +39,7 @@ describe('Parser.getFeature with invalid inputs.', () => {
   };
   it('should not include invalid features.', () => {
     expect(findByPrefix('UW3:', feature)).toBeFalse();
-    expect(findByPrefix('UB3:', feature)).toBeFalse();
     expect(findByPrefix('BW2:', feature)).toBeFalse();
-    expect(findByPrefix('BB2:', feature)).toBeFalse();
   });
 });
 

diff --git a/javascript/src/tests/test_utils.ts b/javascript/src/tests/test_utils.ts
diff --git a/javascript/src/utils.ts b/javascript/src/utils.ts
@@ -14,28 +14,6 @@
  * limitations under the License.
  */
 
-/**
- * Finds the insertion point maintaining the sorted order with a basic
- * bisection algorithm. This works the same as Python's bisect.bisect_right
- * method.
- *
- * @param arr The sorted array.
- * @param i The item to check the insertion point.
- * @returns The insertion point.
- */
-export const bisectRight = (arr: number[], i: number): number => {
-  const mid = Math.floor(arr.length / 2);
-  if (i === arr[mid]) {
-    return mid + 1;
-  } else if (i < arr[mid]) {
-    if (arr.length === 1) return 0;
-    return bisectRight(arr.slice(0, mid), i);
-  } else {
-    if (arr.length === 1) return 1;
-    return mid + bisectRight(arr.slice(mid), i);
-  }
-};
-
 /**
  * Finds the sum of the numbers in the list.
  * @param arr The list of numbers.

diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py
@@ -21,21 +21,6 @@
 
 class TestFeatureExtractor(unittest.TestCase):
 
-  def test_unicode_block_index(self) -> None:
-
-    def check(character: str, block: str, msg: str) -> None:
-      self.assertEqual(
-          feature_extractor.unicode_block_index(character), block, msg)
-
-    check('a', '001', '"a" should be the 1st block "Basic Latin".')
-    check('あ', '108', '"あ" should be the 108th block "Hiragana".')
-    check('安', '120', '"安" should be the 120th block "Kanji"')
-    check('あ安', '108', 'Only the first character should be recognized')
-    check('', utils.INVALID,
-          'Should return INVALID when a blank string is given.')
-    check(utils.INVALID, utils.INVALID,
-          'Should return INVALID when INVALID is given.')
-
   def test_get_feature(self) -> None:
     feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f')
     self.assertSetEqual(
@@ -49,31 +34,16 @@ def test_get_feature(self) -> None:
             'UW5:e',
             'UW6:f',
 
-            # Unigram of Unicode Blocks (UB)
-            'UB1:001',
-            'UB2:001',
-            'UB3:001',
-            'UB4:001',
-            'UB5:001',
-            'UB6:001',
-
-            # Bigram of Words (BW) and Unicode Blocks (BB)
+            # Bigram of Words (BW)
             'BW1:bc',
             'BW2:cd',
             'BW3:de',
-            'BB1:001001',
-            'BB2:001001',
-            'BB3:001001',
 
-            # Trigram of Words (TW) and Unicode Blocks (TB)
+            # Trigram of Words (TW)
             'TW1:abc',
             'TW2:bcd',
             'TW3:cde',
             'TW4:def',
-            'TB1:001001001',
-            'TB2:001001001',
-            'TB3:001001001',
-            'TB4:001001001',
         },
         'Features should be extracted.')
 
@@ -88,15 +58,9 @@ def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
     self.assertFalse(
         find_by_prefix('UW3:', feature),
         'Should omit the Unigram feature when the character is invalid.')
-    self.assertFalse(
-        find_by_prefix('UB3:', feature),
-        'Should omit the Unicode block feature when the character is invalid.')
     self.assertFalse(
         find_by_prefix('BW2:', feature),
         'Should omit the Bigram feature that covers an invalid character.')
-    self.assertFalse(
-        find_by_prefix('BB2:', feature),
-        'Should omit the Unicode feature that covers an invalid character.')
 
 
 if __name__ == '__main__':

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -82,7 +82,7 @@ def test_cmdargs_lang_ja(self) -> None:
     cmdargs = ['-l', 'ja', '今日はいい天気ですね。']
     output = main._main(cmdargs)
 
-    self.assertEqual(output, '今日は\nいい\n天気ですね。')
+    self.assertEqual(output, '今日は\nいい天気ですね。')
 
   def test_cmdargs_lang_zh_hans(self) -> None:
     cmdargs = ['-l', 'zh-hans', '今天天气晴朗。']