Skip to content

Commit

Permalink
Update the model structure for faster processing (#112)
Browse files Browse the repository at this point in the history
  • Loading branch information
tushuhei authored Feb 8, 2023
1 parent d0e863d commit 4be9bcc
Show file tree
Hide file tree
Showing 14 changed files with 243 additions and 151 deletions.
2 changes: 1 addition & 1 deletion budoux/models/ja.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion budoux/models/zh-hans.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion budoux/models/zh-hant.json

Large diffs are not rendered by default.

43 changes: 31 additions & 12 deletions budoux/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
import typing
from html.parser import HTMLParser

from .feature_extractor import get_feature
from .utils import INVALID, SEP
from .utils import SEP

MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: break-word;'
Expand Down Expand Up @@ -104,11 +103,11 @@ class Parser:
model: A dict mapping a feature group name (str, e.g. 'UW4') to a dict that maps each feature value (str) in that group to its score (int).
"""

def __init__(self, model: typing.Dict[str, int]):
def __init__(self, model: typing.Dict[str, typing.Dict[str, int]]):
"""Initializes the parser.
Args:
model (Dict[str, int]): A dict mapping a feature and its score.
model (Dict[str, Dict[str, int]]): A dict mapping a feature group name (e.g. 'UW4') to a dict of feature values and their scores.
"""
self.model = model

Expand All @@ -124,15 +123,35 @@ def parse(self, sentence: str) -> typing.List[str]:
if sentence == '':
return []
chunks = [sentence[0]]
base_score = -sum(self.model.values())
base_score = -sum(sum(g.values()) for g in self.model.values()) * 0.5
for i in range(1, len(sentence)):
feature = get_feature(
sentence[i - 3] if i > 2 else INVALID,
sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], sentence[i],
sentence[i + 1] if i + 1 < len(sentence) else INVALID,
sentence[i + 2] if i + 2 < len(sentence) else INVALID)
score = base_score + 2 * sum(
self.model[f] for f in feature if f in self.model)
score = base_score
if i > 2:
score += self.model.get('UW1', {}).get(sentence[i - 3], 0)
if i > 1:
score += self.model.get('UW2', {}).get(sentence[i - 2], 0)
score += self.model.get('UW3', {}).get(sentence[i - 1], 0)
score += self.model.get('UW4', {}).get(sentence[i], 0)
if i + 1 < len(sentence):
score += self.model.get('UW5', {}).get(sentence[i + 1], 0)
if i + 2 < len(sentence):
score += self.model.get('UW6', {}).get(sentence[i + 2], 0)

if i > 1:
score += self.model.get('BW1', {}).get(sentence[i - 2:i], 0)
score += self.model.get('BW2', {}).get(sentence[i - 1:i + 1], 0)
if i + 1 < len(sentence):
score += self.model.get('BW3', {}).get(sentence[i:i + 2], 0)

if i > 2:
score += self.model.get('TW1', {}).get(sentence[i - 3:i], 0)
if i > 1:
score += self.model.get('TW2', {}).get(sentence[i - 2:i + 1], 0)
if i + 1 < len(sentence):
score += self.model.get('TW3', {}).get(sentence[i - 1:i + 2], 0)
if i + 2 < len(sentence):
score += self.model.get('TW4', {}).get(sentence[i:i + 3], 0)

if score > 0:
chunks.append(sentence[i])
else:
Expand Down
5 changes: 3 additions & 2 deletions javascript/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,9 @@ You can load your own custom model as follows.

```javascript
import { Parser } from 'budoux';
const model = JSON.parse('{"BB2:108120": 1817}'); // Content of the custom model JSON file.
const parser = new Parser(new Map(Object.entries(model)));
const model = JSON.parse('{"UW4": {"a": 133}}'); // Content of the custom model JSON file.
const parser = new Parser(model);
parser.parse('xyzabc'); // ['xyz', 'abc']
```

## Web components
Expand Down
2 changes: 1 addition & 1 deletion javascript/scripts/copy-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const copyModels = () => {
const content = fs.readFileSync(sourcePath);
fs.writeFileSync(
targetPath,
`export const model: {[key:string]: number} = ${content}`
`export const model: {[key:string]: {[key:string]: number}} = ${content}`
);
});
};
Expand Down
4 changes: 2 additions & 2 deletions javascript/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,6 @@ const outputParsedTexts = (
*/
const loadCustomParser = (modelPath: string) => {
const file = readFileSync(path.resolve(modelPath)).toString();
const json = JSON.parse(file);
return new Parser(new Map(Object.entries(json)));
const model = JSON.parse(file);
return new Parser(model);
};
81 changes: 25 additions & 56 deletions javascript/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import {model as zhHansModel} from './data/models/zh-hans.js';
import {model as zhHantModel} from './data/models/zh-hant.js';
import {parseFromString} from './dom.js';
import {HTMLProcessor} from './html_processor.js';
import {INVALID, sum} from './utils.js';
import {sum} from './utils.js';

// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
Expand All @@ -31,47 +31,10 @@ const NODETYPE = {
export class Parser {
model;

constructor(model: Map<string, number>) {
this.model = model;
}

/**
* Generates a feature from characters around (w1-w6).
*
* @param w1 The character 3 characters before the break point.
* @param w2 The character 2 characters before the break point.
* @param w3 The character right before the break point.
* @param w4 The character right after the break point.
* @param w5 The character 2 characters after the break point.
* @param w6 The character 3 characters after the break point.
* @returns A feature to be consumed by a classifier.
*/
static getFeature(
w1: string,
w2: string,
w3: string,
w4: string,
w5: string,
w6: string
) {
const rawFeature = {
UW1: w1,
UW2: w2,
UW3: w3,
UW4: w4,
UW5: w5,
UW6: w6,
BW1: w2 + w3,
BW2: w3 + w4,
BW3: w4 + w5,
TW1: w1 + w2 + w3,
TW2: w2 + w3 + w4,
TW3: w3 + w4 + w5,
TW4: w4 + w5 + w6,
};
return Object.entries(rawFeature)
.filter(entry => !entry[1].includes(INVALID))
.map(([key, value]) => `${key}:${value}`);
constructor(model: {[key: string]: {[key: string]: number}}) {
this.model = new Map(
Object.entries(model).map(([k, v]) => [k, new Map(Object.entries(v))])
);
}

/**
Expand All @@ -96,19 +59,25 @@ export class Parser {
parse(sentence: string) {
if (sentence === '') return [];
const result = [sentence[0]];
const baseScore = -sum([...this.model.values()]);
const baseScore =
-0.5 *
sum([...this.model.values()].map(group => [...group.values()]).flat());

for (let i = 1; i < sentence.length; i++) {
const feature = Parser.getFeature(
sentence[i - 3] || INVALID,
sentence[i - 2] || INVALID,
sentence[i - 1],
sentence[i],
sentence[i + 1] || INVALID,
sentence[i + 2] || INVALID
);
const score =
baseScore + 2 * sum(feature.map(f => this.model.get(f) || 0));
let score = baseScore;
score += this.model.get('UW1')?.get(sentence.slice(i - 3, i - 2)) || 0;
score += this.model.get('UW2')?.get(sentence.slice(i - 2, i - 1)) || 0;
score += this.model.get('UW3')?.get(sentence.slice(i - 1, i)) || 0;
score += this.model.get('UW4')?.get(sentence.slice(i, i + 1)) || 0;
score += this.model.get('UW5')?.get(sentence.slice(i + 1, i + 2)) || 0;
score += this.model.get('UW6')?.get(sentence.slice(i + 2, i + 3)) || 0;
score += this.model.get('BW1')?.get(sentence.slice(i - 2, i)) || 0;
score += this.model.get('BW2')?.get(sentence.slice(i - 1, i + 1)) || 0;
score += this.model.get('BW3')?.get(sentence.slice(i, i + 2)) || 0;
score += this.model.get('TW1')?.get(sentence.slice(i - 3, i)) || 0;
score += this.model.get('TW2')?.get(sentence.slice(i - 2, i + 1)) || 0;
score += this.model.get('TW3')?.get(sentence.slice(i - 1, i + 2)) || 0;
score += this.model.get('TW4')?.get(sentence.slice(i, i + 3)) || 0;
if (score > 0) result.push('');
result[result.length - 1] += sentence[i];
}
Expand Down Expand Up @@ -150,23 +119,23 @@ export class Parser {
* @returns A parser with the default Japanese model.
*/
export const loadDefaultJapaneseParser = () => {
return new Parser(new Map(Object.entries(jaModel)));
return new Parser(jaModel);
};

/**
* Loads a parser equipped with the default Simplified Chinese model.
* @returns A parser with the default Simplified Chinese model.
*/
export const loadDefaultSimplifiedChineseParser = () => {
return new Parser(new Map(Object.entries(zhHansModel)));
return new Parser(zhHansModel);
};

/**
* Loads a parser equipped with the default Traditional Chinese model.
* @returns A parser with the default Traditional Chinese model.
*/
export const loadDefaultTraditionalChineseParser = () => {
return new Parser(new Map(Object.entries(zhHantModel)));
return new Parser(zhHantModel);
};

/**
Expand Down
2 changes: 1 addition & 1 deletion javascript/src/tests/models/separate_right_before_a.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"UW4:a": 1001}
{"UW4": {"a": 1001}}
65 changes: 19 additions & 46 deletions javascript/src/tests/test_parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,64 +17,38 @@
import 'jasmine';
import {JSDOM} from 'jsdom';
import {Parser} from '../parser.js';
import {INVALID} from '../utils.js';

describe('Parser.getFeature', () => {
const feature = Parser.getFeature('a', 'b', 'c', 'd', 'e', 'f');

it('should include certain features.', () => {
expect(feature).toContain('UW1:a');
expect(feature).toContain('BW1:bc');
expect(feature).toContain('TW1:abc');
});
});

describe('Parser.getFeature with invalid inputs.', () => {
const feature = Parser.getFeature('a', 'a', INVALID, 'a', 'a', 'a');
const findByPrefix = (prefix: string, feature: string[]) => {
for (const item of feature) {
if (item.startsWith(prefix)) return true;
}
return false;
};
it('should not include invalid features.', () => {
expect(findByPrefix('UW3:', feature)).toBeFalse();
expect(findByPrefix('BW2:', feature)).toBeFalse();
});
});

describe('Parser.parse', () => {
const TEST_SENTENCE = 'abcdeabcd';

it('should separate if a strong feature item supports.', () => {
const model = new Map([
['UW4:a', 10000], // means "should separate right before 'a'".
]);
const model = {
UW4: {a: 10000}, // means "should separate right before 'a'".
};
const parser = new Parser(model);
const result = parser.parse(TEST_SENTENCE);
expect(result).toEqual(['abcde', 'abcd']);
});

it('should separate even if it makes a phrase of one character.', () => {
const model = new Map([
['UW4:b', 10000], // means "should separate right before 'b'".
]);
const model = {
UW4: {b: 10000}, // means "should separate right before 'b'".
};
const parser = new Parser(model);
const result = parser.parse(TEST_SENTENCE);
expect(result).toEqual(['a', 'bcdea', 'bcd']);
});

it('should return an empty list when the input is a blank string.', () => {
const model = new Map();
const parser = new Parser(model);
const parser = new Parser({});
const result = parser.parse('');
expect(result).toEqual([]);
});
});

describe('Parser.applyElement', () => {
const checkEqual = (
model: Map<string, number>,
model: {[key: string]: {[key: string]: number}},
inputHTML: string,
expectedHTML: string
) => {
Expand All @@ -96,29 +70,29 @@ describe('Parser.applyElement', () => {
const expectedHTML = `
<p style="word-break: keep-all; overflow-wrap: break-word;"
>xyz<wbr>abc<wbr>abc</p>`;
const model = new Map([
['UW4:a', 1001], // means "should separate right before 'a'".
]);
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});

it('should insert WBR tags even it overlaps with other HTML tags.', () => {
const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
const expectedHTML = `<p style="word-break: keep-all; overflow-wrap: break-word;"
>xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
const model = new Map([
['UW4:a', 1001], // means "should separate right before 'a'".
]);
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});
});

describe('Parser.translateHTMLString', () => {
const defaultModel = new Map([
['UW4:a', 1001], // means "should separate right before 'a'".
]);
const defaultModel = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
const checkEqual = (
model: Map<string, number>,
model: {[key: string]: {[key: string]: number}},
inputHTML: string,
expectedHTML: string
) => {
Expand Down Expand Up @@ -150,8 +124,7 @@ describe('Parser.translateHTMLString', () => {
it('should return a blank string if the input is blank.', () => {
const inputHTML = '';
const expectedHTML = '';
const model = new Map();
checkEqual(model, inputHTML, expectedHTML);
checkEqual({}, inputHTML, expectedHTML);
});

it('should pass script tags as-is.', () => {
Expand Down
2 changes: 1 addition & 1 deletion javascript/src/webcomponents/budoux-base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ export abstract class BudouXBaseElement extends HTMLElement {
constructor() {
super();

this.parser = new Parser(new Map());
this.parser = new Parser({});
this.shadow = this.attachShadow({mode: 'open'});
const observer = new MutationObserver(this.sync.bind(this));
observer.observe(this, {
Expand Down
Loading

0 comments on commit 4be9bcc

Please sign in to comment.