Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Thai language support #421

Merged
merged 4 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions budoux/models/th.json

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions budoux/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,14 @@ def load_default_traditional_chinese_parser() -> Parser:
with open(os.path.join(MODEL_DIR, 'zh-hant.json'), encoding='utf-8') as f:
model = json.load(f)
return Parser(model)


def load_default_thai_parser() -> Parser:
  """Builds a parser from the default Thai model file.

  The model is read as UTF-8 JSON from `th.json` under `MODEL_DIR`.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  thai_model_path = os.path.join(MODEL_DIR, 'th.json')
  with open(thai_model_path, encoding='utf-8') as model_file:
    thai_model = json.load(model_file)
  return Parser(thai_model)
4 changes: 2 additions & 2 deletions demo/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion demo/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@
* limitations under the License.
*/

import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser } from 'budoux';
import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux';

// Default parser for each language code, keyed by that code.
const parsers = new Map([
  ['ja', loadDefaultJapaneseParser()],
  ['zh-hans', loadDefaultSimplifiedChineseParser()],
  ['zh-hant', loadDefaultTraditionalChineseParser()],
  ['th', loadDefaultThaiParser()],
]);
// Sample input text for each language code, keyed by that code.
const defaultInputs = new Map<string, string>([
  [
    'ja',
    'Google の使命は、世界中の情報を<strong>整理</strong>し、<em>世界中の人がアクセス</em>できて使えるようにすることです。',
  ],
  [
    'zh-hans',
    '我们的使命是<strong>整合</strong>全球信息,<em>供大众使用</em>,让人人受益。',
  ],
  [
    'zh-hant',
    '我們的使命是<strong>匯整</strong>全球資訊,<em>供大眾使用</em>,使人人受惠。',
  ],
  [
    'th',
    'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์',
  ],
]);
const inputTextElement = document.getElementById('input') as HTMLTextAreaElement;
const outputContainerElement = document.getElementById('output') as HTMLElement;
Expand Down
1 change: 1 addition & 0 deletions demo/static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ <h1>BudouX 🍇</h1>
<option value="ja">Japanese</option>
<option value="zh-hans">Simplified Chinese</option>
<option value="zh-hant">Traditional Chinese</option>
<option value="th">Thai</option>
</select>
</p>
<textarea id="input"></textarea>
Expand Down
9 changes: 9 additions & 0 deletions java/src/main/java/com/google/budoux/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ public static Parser loadDefaultTraditionalChineseParser() {
return loadByFileName("/models/zh-hant.json");
}

/**
 * Creates a parser from the default Thai model resource.
 *
 * @return a BudouX parser with the default Thai model.
 */
public static Parser loadDefaultThaiParser() {
  final String thaiModelPath = "/models/th.json";
  return loadByFileName(thaiModelPath);
}

/**
* Loads a parser by specifying the model file path.
*
Expand Down
9 changes: 9 additions & 0 deletions javascript/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import {model as jaModel} from './data/models/ja.js';
import {model as zhHansModel} from './data/models/zh-hans.js';
import {model as zhHantModel} from './data/models/zh-hant.js';
import {model as thModel} from './data/models/th.js';
import {HTMLProcessingParser} from './html_processor.js';

export {Parser} from './parser.js';
Expand Down Expand Up @@ -47,6 +48,13 @@ export const loadDefaultTraditionalChineseParser = () => {
return new HTMLProcessingParser(zhHantModel);
};

/**
 * Creates an HTML processing parser from the default Thai model.
 * @returns A parser with the default Thai model.
 */
export const loadDefaultThaiParser = () => new HTMLProcessingParser(thModel);
/**
* Loads available default parsers.
* @returns A map between available lang codes and their default parsers.
Expand All @@ -56,5 +64,6 @@ export const loadDefaultParsers = () => {
['ja', loadDefaultJapaneseParser()],
['zh-hans', loadDefaultSimplifiedChineseParser()],
['zh-hant', loadDefaultTraditionalChineseParser()],
['th', loadDefaultThaiParser()],
]);
};
33 changes: 33 additions & 0 deletions javascript/src/webcomponents/budoux-th.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2023 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import {loadDefaultThaiParser} from '../index.js';
import {BudouXBaseElement} from './budoux-base.js';

/**
 * BudouX Thai Web component: a BudouX base element whose parser is the
 * default Thai parser.
 */
export class BudouXThaiElement extends BudouXBaseElement {
  /**
   * Creates the element and assigns the default Thai parser to it.
   */
  constructor() {
    super();
    const thaiParser = loadDefaultThaiParser();
    this.parser = thaiParser;
  }
}

// Registers the class under the `budoux-th` custom element name.
customElements.define('budoux-th', BudouXThaiElement);
66 changes: 66 additions & 0 deletions scripts/prepare_wisesight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepares a dataset from the Wisesight corpus.

Before running this script, you need to download the Wisesight corpus by running:

$ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label
$ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label

Then run this command as follows over each file.

$ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt
$ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt
"""
import argparse
import re

import regex


def parse_args() -> argparse.Namespace:
  """Parses the command-line arguments of this script.

  Returns:
    A namespace with `source_filepath` (the Wisesight label file to read) and
    `outfile` (the dataset path to write; 'source.txt' when not given).
  """
  fallback_outfile = 'source.txt'
  arg_parser = argparse.ArgumentParser(
      formatter_class=argparse.RawTextHelpFormatter, description=__doc__)
  arg_parser.add_argument(
      'source_filepath', help='Path to a Wisesight corpus label file.')
  arg_parser.add_argument(
      '-o',
      '--outfile',
      default=fallback_outfile,
      help=f'File path to the output dataset. (default: {fallback_outfile})')
  return arg_parser.parse_args()


def main() -> None:
  """Converts a Wisesight label file into a '▁'-separated dataset file.

  Reads the file named by the `source_filepath` argument line by line, removes
  URLs, hashtags and emojis, normalizes the '|' separators, and writes every
  cleaned line to the `outfile` path with '|' replaced by '▁'. The output path
  is printed once the conversion is done.
  """
  args = parse_args()
  source_filepath = args.source_filepath
  target_filepath = args.outfile
  # Compiled once up front instead of once per input line.
  emoji_pattern = regex.compile(r'\p{Emoji_Presentation=Yes}+')

  # The output always contains the non-ASCII '▁' (and the corpus is expected to
  # be Thai text), so the encoding is pinned to UTF-8 instead of being left to
  # the platform default, which would fail on non-UTF-8 locales.
  with open(target_filepath, 'w', encoding='utf-8') as outfile:
    with open(source_filepath, encoding='utf-8') as infile:
      for line in infile:
        line = line.strip()
        line = re.sub(r'https?://[^ ]+', '', line)  # Remove URLs
        line = re.sub(r'#[^ ]+', '', line)  # Remove hashtags
        line = emoji_pattern.sub('', line)  # Remove emojis
        line = re.sub(r'\|+', '|', line)  # Remove consecutive separators
        # Drop the trailing separator and any '|<whitespace>' pairs before it.
        line = re.sub(r'(\|\s)*\|$', '', line)
        outfile.write(line.replace('|', '▁') + '\n')  # Replace the separators.
  print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath))


if __name__ == '__main__':
main()
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ dev =
isort
mypy==1.7.1
pytest
regex
toml
twine
types-regex
types-setuptools
yapf

Expand Down
Loading