Skip to content

Commit

Permalink
Remove Unicode Block features (#86)
Browse files Browse the repository at this point in the history
  • Loading branch information
tushuhei authored Nov 10, 2022
1 parent d27ce10 commit ab7b522
Show file tree
Hide file tree
Showing 12 changed files with 7 additions and 225 deletions.
39 changes: 0 additions & 39 deletions budoux/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,10 @@
# limitations under the License.
"""Methods to encode source sentences to features."""

import bisect
import json
import os
import typing

from .utils import INVALID

with open(os.path.join(os.path.dirname(__file__), 'unicode_blocks.json')) as f:
block_starts: typing.List[int] = json.load(f)


def unicode_block_index(w: str) -> str:
  """Looks up which Unicode block a character falls in.

  Only the first character of `w` is examined.

  Args:
    w (str): A character.

  Returns:
    index (str): The block index zero-padded to three digits, or INVALID when
      `w` is empty or is the INVALID marker itself.
  """
  if w and w != INVALID:
    # bisect_right counts how many block start points are <= the code point,
    # which is the 1-based position of the block containing it.
    position = bisect.bisect_right(block_starts, ord(w[0]))
    return f'{position:03d}'
  return INVALID


def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
w6: str) -> typing.List[str]:
Expand All @@ -54,12 +34,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
The feature (list[str]).
"""
b1 = unicode_block_index(w1)
b2 = unicode_block_index(w2)
b3 = unicode_block_index(w3)
b4 = unicode_block_index(w4)
b5 = unicode_block_index(w5)
b6 = unicode_block_index(w6)
raw_feature = {
'UW1': w1,
'UW2': w2,
Expand All @@ -74,19 +48,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
'TW2': w2 + w3 + w4,
'TW3': w3 + w4 + w5,
'TW4': w4 + w5 + w6,
'UB1': b1,
'UB2': b2,
'UB3': b3,
'UB4': b4,
'UB5': b5,
'UB6': b6,
'BB1': b2 + b3,
'BB2': b3 + b4,
'BB3': b4 + b5,
'TB1': b1 + b2 + b3,
'TB2': b2 + b3 + b4,
'TB3': b3 + b4 + b5,
'TB4': b4 + b5 + b6,
}
for key, value in list(raw_feature.items()):
if INVALID in value:
Expand Down
2 changes: 1 addition & 1 deletion budoux/models/ja-knbc.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion budoux/models/zh-hans.json

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion budoux/unicode_blocks.json

This file was deleted.

10 changes: 0 additions & 10 deletions javascript/scripts/copy-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,6 @@ const PROJECT_ROOT = path.join(__dirname, '..', '..');
const DATA_DIR = path.join(PROJECT_ROOT, 'javascript', 'src', 'data');
fs.mkdirSync(path.join(DATA_DIR, 'models'), {recursive: true});

// Re-exports budoux/unicode_blocks.json as a TypeScript module
// (unicode_blocks.ts under DATA_DIR) that exposes `unicodeBlocks`.
const copyUnicodeBlocks = () => {
  const blocksJson = fs
    .readFileSync(path.join(PROJECT_ROOT, 'budoux', 'unicode_blocks.json'))
    .toString();
  const moduleSource = 'export const unicodeBlocks = ' + blocksJson;
  fs.writeFileSync(path.join(DATA_DIR, 'unicode_blocks.ts'), moduleSource);
};

const copyModels = () => {
const modelsDirPath = path.join(PROJECT_ROOT, 'budoux', 'models');
const files = fs.readdirSync(modelsDirPath);
Expand Down Expand Up @@ -57,7 +48,6 @@ const copySkipNodes = () => {
};

// Entry point: runs each copy step once, in this fixed order.
const main = () => {
  for (const step of [copyUnicodeBlocks, copyModels, copySkipNodes]) {
    step();
  }
};
Expand Down
36 changes: 1 addition & 35 deletions javascript/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@
* limitations under the License.
*/

import {unicodeBlocks} from './data/unicode_blocks.js';
import {model as jaKNBCModel} from './data/models/ja-knbc.js';
import {model as zhHansModel} from './data/models/zh-hans.js';
import {parseFromString} from './dom.js';
import {HTMLProcessor} from './html_processor.js';
import {bisectRight, INVALID, sum} from './utils.js';
import {INVALID, sum} from './utils.js';

// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
Expand All @@ -35,20 +34,6 @@ export class Parser {
this.model = model;
}

/**
 * Maps a character to the zero-padded index of its Unicode Block.
 *
 * Only the first code point of the input is inspected.
 *
 * @param w A character input.
 * @returns The block index padded to at least three digits, or INVALID when
 *     the input is empty, is INVALID itself, or has no code point.
 */
static getUnicodeBlockFeature(w: string) {
  if (w && w !== INVALID) {
    const codePoint = w.codePointAt(0);
    if (codePoint !== undefined) {
      const blockIndex = bisectRight(unicodeBlocks, codePoint);
      return String(blockIndex).padStart(3, '0');
    }
  }
  return INVALID;
}

/**
* Generates a feature from characters around (w1-w6).
*
Expand All @@ -68,12 +53,6 @@ export class Parser {
w5: string,
w6: string
) {
const b1 = Parser.getUnicodeBlockFeature(w1);
const b2 = Parser.getUnicodeBlockFeature(w2);
const b3 = Parser.getUnicodeBlockFeature(w3);
const b4 = Parser.getUnicodeBlockFeature(w4);
const b5 = Parser.getUnicodeBlockFeature(w5);
const b6 = Parser.getUnicodeBlockFeature(w6);
const rawFeature = {
UW1: w1,
UW2: w2,
Expand All @@ -88,19 +67,6 @@ export class Parser {
TW2: w2 + w3 + w4,
TW3: w3 + w4 + w5,
TW4: w4 + w5 + w6,
UB1: b1,
UB2: b2,
UB3: b3,
UB4: b4,
UB5: b5,
UB6: b6,
BB1: b2 + b3,
BB2: b3 + b4,
BB3: b4 + b5,
TB1: b1 + b2 + b3,
TB2: b2 + b3 + b4,
TB3: b3 + b4 + b5,
TB4: b4 + b5 + b6,
};
return Object.entries(rawFeature)
.filter(entry => !entry[1].includes(INVALID))
Expand Down
4 changes: 1 addition & 3 deletions javascript/src/tests/test_html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ describe('HTMLProcessor.applyToElement.separator.node', () => {
expect(document.body.innerHTML).toEqual(
'<div class="applied">今日は' +
'<span style="white-space: normal;">\u200B</span>' +
'良い' +
'<span style="white-space: normal;">\u200B</span>' +
'天気</div>'
'良い天気</div>'
);
});
});
Expand Down
32 changes: 0 additions & 32 deletions javascript/src/tests/test_parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,43 +19,13 @@ import {JSDOM} from 'jsdom';
import {Parser} from '../parser.js';
import {INVALID} from '../utils.js';

// Verifies the string returned by Parser.getUnicodeBlockFeature for sample
// characters and for the inputs that must yield INVALID.
describe('Parser.getUnicodeBlockFeature', () => {
  // Asserts that `character` maps to exactly `feature`.
  const testFeature = (character: string, feature: string) => {
    const result = Parser.getUnicodeBlockFeature(character);
    expect(result).toBe(feature);
  };
  it('"a" should be the 1st block "Basic Latin".', () => {
    testFeature('a', '001');
  });
  it('"あ" should be the 108th block "Hiragana".', () => {
    testFeature('あ', '108');
  });
  it('"安" should be the 120th block "Kanji"', () => {
    testFeature('安', '120');
  });
  // Title typo fixed ("recoghnized" -> "recognized") so the test report reads
  // correctly.
  it('Only the first character should be recognized', () => {
    testFeature('あ安', '108');
  });
  it('Should return INVALID when a blank string is given.', () => {
    testFeature('', INVALID);
  });
  it('Should return INVALID when INVALID is given.', () => {
    testFeature(INVALID, INVALID);
  });
});

// Spot-checks entries of the feature list built from six ASCII letters.
describe('Parser.getFeature', () => {
  const feature = Parser.getFeature('a', 'b', 'c', 'd', 'e', 'f');

  it('should include certain features.', () => {
    // Checked in the listed order, one expectation per entry.
    const expectedEntries = [
      'UW1:a',
      'UB1:001',
      'BW1:bc',
      'BB1:001001',
      'TW1:abc',
      'TB1:001001001',
    ];
    for (const entry of expectedEntries) {
      expect(feature).toContain(entry);
    }
  });
});

Expand All @@ -69,9 +39,7 @@ describe('Parser.getFeature with invalid inputs.', () => {
};
it('should not include invalid features.', () => {
expect(findByPrefix('UW3:', feature)).toBeFalse();
expect(findByPrefix('UB3:', feature)).toBeFalse();
expect(findByPrefix('BW2:', feature)).toBeFalse();
expect(findByPrefix('BB2:', feature)).toBeFalse();
});
});

Expand Down
42 changes: 0 additions & 42 deletions javascript/src/tests/test_utils.ts

This file was deleted.

22 changes: 0 additions & 22 deletions javascript/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,6 @@
* limitations under the License.
*/

/**
 * Finds the insertion point that keeps `arr` sorted, placing `i` after any
 * existing entries equal to it. This works the same as Python's
 * bisect.bisect_right method.
 *
 * Implemented as an iterative binary search over index bounds, so it never
 * copies the array, returns 0 for an empty array instead of recursing
 * forever, and lands to the right of every duplicate of `i`.
 *
 * @param arr The sorted array.
 * @param i The item to check the insertion point.
 * @returns The insertion point.
 */
export const bisectRight = (arr: number[], i: number): number => {
  // Invariant: every element before `lo` is <= i; every element from `hi`
  // onward is > i.
  let lo = 0;
  let hi = arr.length;
  while (lo < hi) {
    const mid = lo + Math.floor((hi - lo) / 2);
    if (i < arr[mid]) {
      hi = mid;
    } else {
      // Equal elements move the lower bound past them, giving "right" bias.
      lo = mid + 1;
    }
  }
  return lo;
};

/**
* Finds the sum of the numbers in the list.
* @param arr The list of numbers.
Expand Down
40 changes: 2 additions & 38 deletions tests/test_feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,6 @@

class TestFeatureExtractor(unittest.TestCase):

def test_unicode_block_index(self) -> None:
  """Checks block indices for sample characters and the INVALID fallbacks."""
  # Each case is (input, expected block index, failure message).
  cases = [
      ('a', '001', '"a" should be the 1st block "Basic Latin".'),
      ('あ', '108', '"あ" should be the 108th block "Hiragana".'),
      ('安', '120', '"安" should be the 120th block "Kanji"'),
      ('あ安', '108', 'Only the first character should be recognized'),
      ('', utils.INVALID,
       'Should return INVALID when a blank string is given.'),
      (utils.INVALID, utils.INVALID,
       'Should return INVALID when INVALID is given.'),
  ]
  for character, block, msg in cases:
    self.assertEqual(
        feature_extractor.unicode_block_index(character), block, msg)

def test_get_feature(self) -> None:
feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f')
self.assertSetEqual(
Expand All @@ -49,31 +34,16 @@ def test_get_feature(self) -> None:
'UW5:e',
'UW6:f',

# Unigram of Unicode Blocks (UB)
'UB1:001',
'UB2:001',
'UB3:001',
'UB4:001',
'UB5:001',
'UB6:001',

# Bigram of Words (BW) and Unicode Blocks (BB)
# Bigram of Words (BW)
'BW1:bc',
'BW2:cd',
'BW3:de',
'BB1:001001',
'BB2:001001',
'BB3:001001',

# Trigram of Words (TW) and Unicode Blocks (TB)
# Trigram of Words (TW)
'TW1:abc',
'TW2:bcd',
'TW3:cde',
'TW4:def',
'TB1:001001001',
'TB2:001001001',
'TB3:001001001',
'TB4:001001001',
},
'Features should be extracted.')

Expand All @@ -88,15 +58,9 @@ def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
self.assertFalse(
find_by_prefix('UW3:', feature),
'Should omit the Unigram feature when the character is invalid.')
self.assertFalse(
find_by_prefix('UB3:', feature),
'Should omit the Unicode block feature when the character is invalid.')
self.assertFalse(
find_by_prefix('BW2:', feature),
'Should omit the Bigram feature that covers an invalid character.')
self.assertFalse(
find_by_prefix('BB2:', feature),
'Should omit the Unicode feature that covers an invalid character.')


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_cmdargs_lang_ja(self) -> None:
cmdargs = ['-l', 'ja', '今日はいい天気ですね。']
output = main._main(cmdargs)

self.assertEqual(output, '今日は\nいい\n天気ですね。')
self.assertEqual(output, '今日は\nいい天気ですね。')

def test_cmdargs_lang_zh_hans(self) -> None:
cmdargs = ['-l', 'zh-hans', '今天天气晴朗。']
Expand Down

0 comments on commit ab7b522

Please sign in to comment.