Skip to content

Commit

Permalink
Add Thai language support (#421)
Browse files Browse the repository at this point in the history
tushuhei authored Dec 20, 2023
1 parent 204a628 commit f43d94c
Showing 10 changed files with 137 additions and 3 deletions.
1 change: 1 addition & 0 deletions budoux/models/th.json

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions budoux/parser.py
Original file line number Diff line number Diff line change
@@ -133,3 +133,14 @@ def load_default_traditional_chinese_parser() -> Parser:
with open(os.path.join(MODEL_DIR, 'zh-hant.json'), encoding='utf-8') as f:
model = json.load(f)
return Parser(model)


def load_default_thai_parser() -> Parser:
  """Builds a parser from the default Thai model file.

  The model is read as UTF-8 JSON from `th.json` under `MODEL_DIR`.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  thai_model_path = os.path.join(MODEL_DIR, 'th.json')
  with open(thai_model_path, encoding='utf-8') as model_file:
    thai_model = json.load(model_file)
  return Parser(thai_model)
4 changes: 2 additions & 2 deletions demo/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion demo/src/app.ts
Original file line number Diff line number Diff line change
@@ -14,17 +14,19 @@
* limitations under the License.
*/

import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser } from 'budoux';
import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux';

// Default parser for each language code offered by the demo.
const parsers = new Map([
  ['ja', loadDefaultJapaneseParser()],
  ['zh-hans', loadDefaultSimplifiedChineseParser()],
  ['zh-hant', loadDefaultTraditionalChineseParser()],
  ['th', loadDefaultThaiParser()],
]);
// Sample text for each language code; keys mirror those of `parsers`.
const defaultInputs = new Map<string, string>([
  [
    'ja',
    'Google の使命は、世界中の情報を<strong>整理</strong>し、<em>世界中の人がアクセス</em>できて使えるようにすることです。',
  ],
  [
    'zh-hans',
    '我们的使命是<strong>整合</strong>全球信息,<em>供大众使用</em>,让人人受益。',
  ],
  [
    'zh-hant',
    '我們的使命是<strong>匯整</strong>全球資訊,<em>供大眾使用</em>,使人人受惠。',
  ],
  [
    'th',
    'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์',
  ],
]);
// Element with id "input", asserted to be a <textarea>.
// NOTE(review): getElementById can return null; the `as` cast hides that, so
// this assumes the element always exists in the page — confirm.
const inputTextElement = document.getElementById('input') as HTMLTextAreaElement;
// Element with id "output"; same non-null assumption as above.
const outputContainerElement = document.getElementById('output') as HTMLElement;
1 change: 1 addition & 0 deletions demo/static/index.html
Original file line number Diff line number Diff line change
@@ -74,6 +74,7 @@ <h1>BudouX 🍇</h1>
<option value="ja">Japanese</option>
<option value="zh-hans">Simplified Chinese</option>
<option value="zh-hant">Traditional Chinese</option>
<option value="th">Thai</option>
</select>
</p>
<textarea id="input"></textarea>
9 changes: 9 additions & 0 deletions java/src/main/java/com/google/budoux/Parser.java
Original file line number Diff line number Diff line change
@@ -84,6 +84,15 @@ public static Parser loadDefaultTraditionalChineseParser() {
return loadByFileName("/models/zh-hant.json");
}

/**
 * Loads the default Thai parser from the model file {@code /models/th.json}.
 *
 * @return a BudouX parser with the default Thai model.
 */
public static Parser loadDefaultThaiParser() {
  final String thaiModelFileName = "/models/th.json";
  return loadByFileName(thaiModelFileName);
}

/**
* Loads a parser by specifying the model file path.
*
9 changes: 9 additions & 0 deletions javascript/src/index.ts
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@
import {model as jaModel} from './data/models/ja.js';
import {model as zhHansModel} from './data/models/zh-hans.js';
import {model as zhHantModel} from './data/models/zh-hant.js';
import {model as thModel} from './data/models/th.js';
import {HTMLProcessingParser} from './html_processor.js';

export {Parser} from './parser.js';
@@ -47,6 +48,13 @@ export const loadDefaultTraditionalChineseParser = () => {
return new HTMLProcessingParser(zhHantModel);
};

/**
 * Creates an HTML-processing parser backed by the default Thai model.
 * @returns A parser with the default Thai model.
 */
export const loadDefaultThaiParser = () => new HTMLProcessingParser(thModel);
/**
* Loads available default parsers.
* @returns A map between available lang codes and their default parsers.
@@ -56,5 +64,6 @@ export const loadDefaultParsers = () => {
['ja', loadDefaultJapaneseParser()],
['zh-hans', loadDefaultSimplifiedChineseParser()],
['zh-hant', loadDefaultTraditionalChineseParser()],
['th', loadDefaultThaiParser()],
]);
};
33 changes: 33 additions & 0 deletions javascript/src/webcomponents/budoux-th.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2023 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import {loadDefaultThaiParser} from '../index.js';
import {BudouXBaseElement} from './budoux-base.js';

/**
 * Web component that uses the default Thai parser.
 * It is registered below under the tag name `budoux-th`.
 */
export class BudouXThaiElement extends BudouXBaseElement {
  /**
   * Initializes the base element, then installs the default Thai parser.
   */
  constructor() {
    super();
    const thaiParser = loadDefaultThaiParser();
    this.parser = thaiParser;
  }
}

// Register the element so that <budoux-th> resolves to this class.
customElements.define('budoux-th', BudouXThaiElement);
66 changes: 66 additions & 0 deletions scripts/prepare_wisesight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepares a dataset from the Wisesight corpus.
Before running this script, you need to download the Wisesight corpus by running:
$ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label
$ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label
Then run this command as follows over each file.
$ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt
$ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt
"""
import argparse
import re
import typing

import regex


def parse_args(
    argv: typing.Optional[typing.Sequence[str]] = None
) -> argparse.Namespace:
  """Parses the command-line arguments of this script.

  Args:
    argv: Argument strings to parse. When None (the default), argparse reads
      the arguments from `sys.argv`, which is what a plain `parse_args()` call
      has always done. Passing a list allows programmatic use and testing.

  Returns:
    A namespace with `source_filepath` (path to the Wisesight label file) and
    `outfile` (path of the dataset to write, 'source.txt' when not given).
  """
  DEFAULT_OUT_PATH = 'source.txt'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'source_filepath', help='Path to a Wisesight corpus label file.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {DEFAULT_OUT_PATH})',
      default=DEFAULT_OUT_PATH)
  return parser.parse_args(argv)


# Matches runs of characters whose Emoji_Presentation property is Yes.
# Compiled once here instead of once per input line.
_EMOJI_PATTERN = regex.compile(r'\p{Emoji_Presentation=Yes}+')


def _clean_line(line: str) -> str:
  """Converts one raw label-file line into one line of the output dataset.

  Args:
    line: A line from the label file, with `|` between tokens.

  Returns:
    The line with URLs, hashtags and emojis removed, separators normalized and
    every remaining `|` replaced by `▁`. No trailing newline is included.
  """
  line = line.strip()
  line = re.sub(r'https?://[^ ]+', '', line)  # Remove URLs
  line = re.sub(r'#[^ ]+', '', line)  # Remove hashtags
  line = _EMOJI_PATTERN.sub('', line)  # Remove emojis
  line = re.sub(r'\|+', '|', line)  # Collapse consecutive separators
  # Drop the trailing separator, together with any "separator + whitespace"
  # pairs immediately before it, so the line does not end with empty tokens.
  line = re.sub(r'(\|\s)*\|$', '', line)
  return line.replace('|', '▁')  # Replace the separators.


def main() -> None:
  """Reads the label file named on the command line and writes the dataset.

  Each input line is cleaned with `_clean_line` and written on its own line to
  the output path. Both files are handled as UTF-8 regardless of the platform
  default, because the corpus is Thai text. The input is opened first so that a
  missing or unreadable input does not leave an empty output file behind.
  """
  args = parse_args()
  source_filepath = args.source_filepath
  target_filepath = args.outfile

  with open(source_filepath, encoding='utf-8') as infile:
    with open(target_filepath, 'w', encoding='utf-8') as outfile:
      for line in infile:
        outfile.write(_clean_line(line) + '\n')
  print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath))


if __name__ == '__main__':
main()
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -28,8 +28,10 @@ dev =
isort
mypy==1.7.1
pytest
regex
toml
twine
types-regex
types-setuptools
yapf

0 comments on commit f43d94c

Please sign in to comment.