Update the model structure for faster processing (#112)

google · Feb 8, 2023 · 4be9bcc · 4be9bcc
1 parent d0e863d
commit 4be9bcc
Show file tree

Hide file tree

Showing 14 changed files with 243 additions and 151 deletions.
diff --git a/budoux/models/ja.json b/budoux/models/ja.json
diff --git a/budoux/models/zh-hans.json b/budoux/models/zh-hans.json
diff --git a/budoux/models/zh-hant.json b/budoux/models/zh-hant.json
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -18,8 +18,7 @@
 import typing
 from html.parser import HTMLParser
 
-from .feature_extractor import get_feature
-from .utils import INVALID, SEP
+from .utils import SEP
 
 MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
 PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: break-word;'
@@ -104,11 +103,11 @@ class Parser:
     model: A dict mapping a feature (str) and its score (int).
   """
 
-  def __init__(self, model: typing.Dict[str, int]):
+  def __init__(self, model: typing.Dict[str, typing.Dict[str, int]]):
     """Initializes the parser.
 
     Args:
-      model (Dict[str, int]): A dict mapping a feature and its score.
+      model (Dict[str, Dict[str, int]]): Maps a feature type to {value: score}.
     """
     self.model = model
 
@@ -124,15 +123,35 @@ def parse(self, sentence: str) -> typing.List[str]:
     if sentence == '':
       return []
     chunks = [sentence[0]]
-    base_score = -sum(self.model.values())
+    base_score = -sum(sum(g.values()) for g in self.model.values()) * 0.5
     for i in range(1, len(sentence)):
-      feature = get_feature(
-          sentence[i - 3] if i > 2 else INVALID,
-          sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], sentence[i],
-          sentence[i + 1] if i + 1 < len(sentence) else INVALID,
-          sentence[i + 2] if i + 2 < len(sentence) else INVALID)
-      score = base_score + 2 * sum(
-          self.model[f] for f in feature if f in self.model)
+      score = base_score
+      if i > 2:
+        score += self.model.get('UW1', {}).get(sentence[i - 3], 0)
+      if i > 1:
+        score += self.model.get('UW2', {}).get(sentence[i - 2], 0)
+      score += self.model.get('UW3', {}).get(sentence[i - 1], 0)
+      score += self.model.get('UW4', {}).get(sentence[i], 0)
+      if i + 1 < len(sentence):
+        score += self.model.get('UW5', {}).get(sentence[i + 1], 0)
+      if i + 2 < len(sentence):
+        score += self.model.get('UW6', {}).get(sentence[i + 2], 0)
+
+      if i > 1:
+        score += self.model.get('BW1', {}).get(sentence[i - 2:i], 0)
+      score += self.model.get('BW2', {}).get(sentence[i - 1:i + 1], 0)
+      if i + 1 < len(sentence):
+        score += self.model.get('BW3', {}).get(sentence[i:i + 2], 0)
+
+      if i > 2:
+        score += self.model.get('TW1', {}).get(sentence[i - 3:i], 0)
+      if i > 1:
+        score += self.model.get('TW2', {}).get(sentence[i - 2:i + 1], 0)
+      if i + 1 < len(sentence):
+        score += self.model.get('TW3', {}).get(sentence[i - 1:i + 2], 0)
+      if i + 2 < len(sentence):
+        score += self.model.get('TW4', {}).get(sentence[i:i + 3], 0)
+
       if score > 0:
         chunks.append(sentence[i])
       else:

diff --git a/javascript/README.md b/javascript/README.md
@@ -94,8 +94,9 @@ You can load your own custom model as follows.
 
 ```javascript
 import { Parser } from 'budoux';
-const model = JSON.parse('{"BB2:108120": 1817}');  // Content of the custom model JSON file.
-const parser = new Parser(new Map(Object.entries(model)));
+const model = JSON.parse('{"UW4": {"a": 133}}');  // Content of the custom model JSON file.
+const parser = new Parser(model);
+parser.parse('xyzabc');  // ['xyz', 'abc']
 ```
 
 ## Web components

diff --git a/javascript/scripts/copy-data.js b/javascript/scripts/copy-data.js
@@ -33,7 +33,7 @@ const copyModels = () => {
     const content = fs.readFileSync(sourcePath);
     fs.writeFileSync(
       targetPath,
-      `export const model: {[key:string]: number} = ${content}`
+      `export const model: {[key:string]: {[key:string]: number}} = ${content}`
     );
   });
 };

diff --git a/javascript/src/cli.ts b/javascript/src/cli.ts
@@ -133,6 +133,6 @@ const outputParsedTexts = (
  */
 const loadCustomParser = (modelPath: string) => {
   const file = readFileSync(path.resolve(modelPath)).toString();
-  const json = JSON.parse(file);
-  return new Parser(new Map(Object.entries(json)));
+  const model = JSON.parse(file);
+  return new Parser(model);
 };
diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts
@@ -19,7 +19,7 @@ import {model as zhHansModel} from './data/models/zh-hans.js';
 import {model as zhHantModel} from './data/models/zh-hant.js';
 import {parseFromString} from './dom.js';
 import {HTMLProcessor} from './html_processor.js';
-import {INVALID, sum} from './utils.js';
+import {sum} from './utils.js';
 
 // We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
 // but we define the same here for Node.js environments.
@@ -31,47 +31,10 @@ const NODETYPE = {
 export class Parser {
   model;
 
-  constructor(model: Map<string, number>) {
-    this.model = model;
-  }
-
-  /**
-   * Generates a feature from characters around (w1-w6).
-   *
-   * @param w1 The character 3 characters before the break point.
-   * @param w2 The character 2 characters before the break point.
-   * @param w3 The character right before the break point.
-   * @param w4 The character right after the break point.
-   * @param w5 The character 2 characters after the break point.
-   * @param w6 The character 3 characters after the break point.
-   * @returns A feature to be consumed by a classifier.
-   */
-  static getFeature(
-    w1: string,
-    w2: string,
-    w3: string,
-    w4: string,
-    w5: string,
-    w6: string
-  ) {
-    const rawFeature = {
-      UW1: w1,
-      UW2: w2,
-      UW3: w3,
-      UW4: w4,
-      UW5: w5,
-      UW6: w6,
-      BW1: w2 + w3,
-      BW2: w3 + w4,
-      BW3: w4 + w5,
-      TW1: w1 + w2 + w3,
-      TW2: w2 + w3 + w4,
-      TW3: w3 + w4 + w5,
-      TW4: w4 + w5 + w6,
-    };
-    return Object.entries(rawFeature)
-      .filter(entry => !entry[1].includes(INVALID))
-      .map(([key, value]) => `${key}:${value}`);
+  constructor(model: {[key: string]: {[key: string]: number}}) {
+    this.model = new Map(
+      Object.entries(model).map(([k, v]) => [k, new Map(Object.entries(v))])
+    );
   }
 
   /**
@@ -96,19 +59,25 @@ export class Parser {
   parse(sentence: string) {
     if (sentence === '') return [];
     const result = [sentence[0]];
-    const baseScore = -sum([...this.model.values()]);
+    const baseScore =
+      -0.5 *
+      sum([...this.model.values()].map(group => [...group.values()]).flat());
 
     for (let i = 1; i < sentence.length; i++) {
-      const feature = Parser.getFeature(
-        sentence[i - 3] || INVALID,
-        sentence[i - 2] || INVALID,
-        sentence[i - 1],
-        sentence[i],
-        sentence[i + 1] || INVALID,
-        sentence[i + 2] || INVALID
-      );
-      const score =
-        baseScore + 2 * sum(feature.map(f => this.model.get(f) || 0));
+      let score = baseScore;
+      score += this.model.get('UW1')?.get(sentence.slice(i - 3, i - 2)) || 0;
+      score += this.model.get('UW2')?.get(sentence.slice(i - 2, i - 1)) || 0;
+      score += this.model.get('UW3')?.get(sentence.slice(i - 1, i)) || 0;
+      score += this.model.get('UW4')?.get(sentence.slice(i, i + 1)) || 0;
+      score += this.model.get('UW5')?.get(sentence.slice(i + 1, i + 2)) || 0;
+      score += this.model.get('UW6')?.get(sentence.slice(i + 2, i + 3)) || 0;
+      score += this.model.get('BW1')?.get(sentence.slice(i - 2, i)) || 0;
+      score += this.model.get('BW2')?.get(sentence.slice(i - 1, i + 1)) || 0;
+      score += this.model.get('BW3')?.get(sentence.slice(i, i + 2)) || 0;
+      score += this.model.get('TW1')?.get(sentence.slice(i - 3, i)) || 0;
+      score += this.model.get('TW2')?.get(sentence.slice(i - 2, i + 1)) || 0;
+      score += this.model.get('TW3')?.get(sentence.slice(i - 1, i + 2)) || 0;
+      score += this.model.get('TW4')?.get(sentence.slice(i, i + 3)) || 0;
       if (score > 0) result.push('');
       result[result.length - 1] += sentence[i];
     }
@@ -150,23 +119,23 @@ export class Parser {
  * @returns A parser with the default Japanese model.
  */
 export const loadDefaultJapaneseParser = () => {
-  return new Parser(new Map(Object.entries(jaModel)));
+  return new Parser(jaModel);
 };
 
 /**
  * Loads a parser equipped with the default Simplified Chinese model.
  * @returns A parser with the default Simplified Chinese model.
  */
 export const loadDefaultSimplifiedChineseParser = () => {
-  return new Parser(new Map(Object.entries(zhHansModel)));
+  return new Parser(zhHansModel);
 };
 
 /**
  * Loads a parser equipped with the default Traditional Chinese model.
  * @returns A parser with the default Traditional Chinese model.
  */
 export const loadDefaultTraditionalChineseParser = () => {
-  return new Parser(new Map(Object.entries(zhHantModel)));
+  return new Parser(zhHantModel);
 };
 
 /**

diff --git a/javascript/src/tests/models/separate_right_before_a.json b/javascript/src/tests/models/separate_right_before_a.json
@@ -1 +1 @@
-{"UW4:a": 1001}
+{"UW4": {"a": 1001}}
diff --git a/javascript/src/tests/test_parser.ts b/javascript/src/tests/test_parser.ts
@@ -17,64 +17,38 @@
 import 'jasmine';
 import {JSDOM} from 'jsdom';
 import {Parser} from '../parser.js';
-import {INVALID} from '../utils.js';
-
-describe('Parser.getFeature', () => {
-  const feature = Parser.getFeature('a', 'b', 'c', 'd', 'e', 'f');
-
-  it('should include certain features.', () => {
-    expect(feature).toContain('UW1:a');
-    expect(feature).toContain('BW1:bc');
-    expect(feature).toContain('TW1:abc');
-  });
-});
-
-describe('Parser.getFeature with invalid inputs.', () => {
-  const feature = Parser.getFeature('a', 'a', INVALID, 'a', 'a', 'a');
-  const findByPrefix = (prefix: string, feature: string[]) => {
-    for (const item of feature) {
-      if (item.startsWith(prefix)) return true;
-    }
-    return false;
-  };
-  it('should not include invalid features.', () => {
-    expect(findByPrefix('UW3:', feature)).toBeFalse();
-    expect(findByPrefix('BW2:', feature)).toBeFalse();
-  });
-});
 
 describe('Parser.parse', () => {
   const TEST_SENTENCE = 'abcdeabcd';
 
   it('should separate if a strong feature item supports.', () => {
-    const model = new Map([
-      ['UW4:a', 10000], // means "should separate right before 'a'".
-    ]);
+    const model = {
+      UW4: {a: 10000}, // means "should separate right before 'a'".
+    };
     const parser = new Parser(model);
     const result = parser.parse(TEST_SENTENCE);
     expect(result).toEqual(['abcde', 'abcd']);
   });
 
   it('should separate even if it makes a phrase of one character.', () => {
-    const model = new Map([
-      ['UW4:b', 10000], // means "should separate right before 'b'".
-    ]);
+    const model = {
+      UW4: {b: 10000}, // means "should separate right before 'b'".
+    };
     const parser = new Parser(model);
     const result = parser.parse(TEST_SENTENCE);
     expect(result).toEqual(['a', 'bcdea', 'bcd']);
   });
 
   it('should return an empty list when the input is a blank string.', () => {
-    const model = new Map();
-    const parser = new Parser(model);
+    const parser = new Parser({});
     const result = parser.parse('');
     expect(result).toEqual([]);
   });
 });
 
 describe('Parser.applyElement', () => {
   const checkEqual = (
-    model: Map<string, number>,
+    model: {[key: string]: {[key: string]: number}},
     inputHTML: string,
     expectedHTML: string
   ) => {
@@ -96,29 +70,29 @@ describe('Parser.applyElement', () => {
     const expectedHTML = `
     <p style="word-break: keep-all; overflow-wrap: break-word;"
     >xyz<wbr>abc<wbr>abc</p>`;
-    const model = new Map([
-      ['UW4:a', 1001], // means "should separate right before 'a'".
-    ]);
+    const model = {
+      UW4: {a: 1001}, // means "should separate right before 'a'".
+    };
     checkEqual(model, inputHTML, expectedHTML);
   });
 
   it('should insert WBR tags even it overlaps with other HTML tags.', () => {
     const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
     const expectedHTML = `<p style="word-break: keep-all; overflow-wrap: break-word;"
     >xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
-    const model = new Map([
-      ['UW4:a', 1001], // means "should separate right before 'a'".
-    ]);
+    const model = {
+      UW4: {a: 1001}, // means "should separate right before 'a'".
+    };
     checkEqual(model, inputHTML, expectedHTML);
   });
 });
 
 describe('Parser.translateHTMLString', () => {
-  const defaultModel = new Map([
-    ['UW4:a', 1001], // means "should separate right before 'a'".
-  ]);
+  const defaultModel = {
+    UW4: {a: 1001}, // means "should separate right before 'a'".
+  };
   const checkEqual = (
-    model: Map<string, number>,
+    model: {[key: string]: {[key: string]: number}},
     inputHTML: string,
     expectedHTML: string
   ) => {
@@ -150,8 +124,7 @@ describe('Parser.translateHTMLString', () => {
   it('should return a blank string if the input is blank.', () => {
     const inputHTML = '';
     const expectedHTML = '';
-    const model = new Map();
-    checkEqual(model, inputHTML, expectedHTML);
+    checkEqual({}, inputHTML, expectedHTML);
   });
 
   it('should pass script tags as-is.', () => {

diff --git a/javascript/src/webcomponents/budoux-base.ts b/javascript/src/webcomponents/budoux-base.ts
@@ -29,7 +29,7 @@ export abstract class BudouXBaseElement extends HTMLElement {
   constructor() {
     super();
 
-    this.parser = new Parser(new Map());
+    this.parser = new Parser({});
     this.shadow = this.attachShadow({mode: 'open'});
     const observer = new MutationObserver(this.sync.bind(this));
     observer.observe(this, {