Add Thai language support (#421)

google · Dec 20, 2023 · f43d94c · f43d94c
1 parent 204a628
commit f43d94c
Showing 10 changed files with 137 additions and 3 deletions.
diff --git a/budoux/models/th.json b/budoux/models/th.json
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -133,3 +133,14 @@ def load_default_traditional_chinese_parser() -> Parser:
   with open(os.path.join(MODEL_DIR, 'zh-hant.json'), encoding='utf-8') as f:
     model = json.load(f)
   return Parser(model)
+
+
+def load_default_thai_parser() -> Parser:
+  """Builds a parser backed by the default Thai model.
+
+  The model is read from ``th.json`` under ``MODEL_DIR`` as UTF-8 JSON.
+
+  Returns:
+    A parser (:obj:`budoux.Parser`) constructed from the loaded model.
+  """
+  model_path = os.path.join(MODEL_DIR, 'th.json')
+  with open(model_path, encoding='utf-8') as model_file:
+    thai_model = json.load(model_file)
+  return Parser(thai_model)
diff --git a/demo/package-lock.json b/demo/package-lock.json
diff --git a/demo/src/app.ts b/demo/src/app.ts
@@ -14,17 +14,19 @@
  * limitations under the License.
  */
 
-import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser } from 'budoux';
+import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux';
 
 const parsers = new Map([
   ['ja', loadDefaultJapaneseParser()],
   ['zh-hans', loadDefaultSimplifiedChineseParser()],
   ['zh-hant', loadDefaultTraditionalChineseParser()],
+  ['th', loadDefaultThaiParser()]
 ]);
 const defaultInputs = new Map([
   ['ja', 'Google の使命は、世界中の情報を<strong>整理</strong>し、<em>世界中の人がアクセス</em>できて使えるようにすることです。'],
   ['zh-hans', '我们的使命是<strong>整合</strong>全球信息，<em>供大众使用</em>，让人人受益。'],
   ['zh-hant', '我們的使命是<strong>匯整</strong>全球資訊，<em>供大眾使用</em>，使人人受惠。'],
+  ['th', 'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์']
 ])
 const inputTextElement = document.getElementById('input') as HTMLTextAreaElement;
 const outputContainerElement = document.getElementById('output') as HTMLElement;

diff --git a/demo/static/index.html b/demo/static/index.html
@@ -74,6 +74,7 @@ <h1>BudouX 🍇</h1>
           <option value="ja">Japanese</option>
           <option value="zh-hans">Simplified Chinese</option>
           <option value="zh-hant">Traditional Chinese</option>
+          <option value="th">Thai</option>
         </select>
       </p>
       <textarea id="input"></textarea>

diff --git a/java/src/main/java/com/google/budoux/Parser.java b/java/src/main/java/com/google/budoux/Parser.java
@@ -84,6 +84,15 @@ public static Parser loadDefaultTraditionalChineseParser() {
     return loadByFileName("/models/zh-hant.json");
   }
 
+  /**
+   * Creates a parser that uses the default Thai model.
+   *
+   * @return a BudouX parser built from the default Thai model file ({@code /models/th.json}).
+   */
+  public static Parser loadDefaultThaiParser() {
+    final String thaiModelPath = "/models/th.json";
+    return loadByFileName(thaiModelPath);
+  }
+
   /**
    * Loads a parser by specifying the model file path.
    *

diff --git a/javascript/src/index.ts b/javascript/src/index.ts
@@ -17,6 +17,7 @@
 import {model as jaModel} from './data/models/ja.js';
 import {model as zhHansModel} from './data/models/zh-hans.js';
 import {model as zhHantModel} from './data/models/zh-hant.js';
+import {model as thModel} from './data/models/th.js';
 import {HTMLProcessingParser} from './html_processor.js';
 
 export {Parser} from './parser.js';
@@ -47,6 +48,13 @@ export const loadDefaultTraditionalChineseParser = () => {
   return new HTMLProcessingParser(zhHantModel);
 };
 
+/**
+ * Creates a parser backed by the default Thai model.
+ * @returns An HTML processing parser constructed with the default Thai model.
+ */
+export const loadDefaultThaiParser = () => new HTMLProcessingParser(thModel);
 /**
  * Loads available default parsers.
  * @returns A map between available lang codes and their default parsers.
@@ -56,5 +64,6 @@ export const loadDefaultParsers = () => {
     ['ja', loadDefaultJapaneseParser()],
     ['zh-hans', loadDefaultSimplifiedChineseParser()],
     ['zh-hant', loadDefaultTraditionalChineseParser()],
+    ['th', loadDefaultThaiParser()],
   ]);
 };
diff --git a/javascript/src/webcomponents/budoux-th.ts b/javascript/src/webcomponents/budoux-th.ts
@@ -0,0 +1,33 @@
+/**
+ * @license
+ * Copyright 2023 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import {loadDefaultThaiParser} from '../index.js';
+import {BudouXBaseElement} from './budoux-base.js';
+
+/**
+ * BudouX Web component preconfigured with the default Thai parser.
+ */
+export class BudouXThaiElement extends BudouXBaseElement {
+  /**
+   * Runs the base element constructor, then installs the default Thai parser
+   * as this element's parser.
+   */
+  constructor() {
+    super();
+    const thaiParser = loadDefaultThaiParser();
+    this.parser = thaiParser;
+  }
+}
+
+// Register the component as the `budoux-th` custom element.
+customElements.define('budoux-th', BudouXThaiElement);
diff --git a/scripts/prepare_wisesight.py b/scripts/prepare_wisesight.py
@@ -0,0 +1,66 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepares a dataset from the Wisesight corpus.
+
+Before running this script, you need to download the Wisesight corpus by running:
+
+$ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label
+$ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label
+
+Then run this command as follows over each file.
+
+$ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt
+$ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt
+"""
+import argparse
+import re
+
+import regex
+
+
+def parse_args() -> argparse.Namespace:
+  """Parses the command-line arguments of this script.
+
+  Returns:
+    The parsed namespace with ``source_filepath`` (the input label file) and
+    ``outfile`` (the output path; ``source.txt`` unless ``-o`` is given).
+  """
+  default_outfile = 'source.txt'
+  # The module docstring doubles as the help text, so keep its line breaks.
+  arg_parser = argparse.ArgumentParser(
+      formatter_class=argparse.RawTextHelpFormatter, description=__doc__)
+  arg_parser.add_argument(
+      'source_filepath', help='Path to a Wisesight corpus label file.')
+  outfile_help = f'File path to the output dataset. (default: {default_outfile})'
+  arg_parser.add_argument(
+      '-o', '--outfile', default=default_outfile, help=outfile_help)
+  return arg_parser.parse_args()
+
+
+def main() -> None:
+  """Converts a Wisesight label file into a training source file.
+
+  Each input line has URLs, hashtags and emoji removed and its ``|``
+  separators normalized; the result is written to the output file with ``▁``
+  as the separator, one output line per input line.
+  """
+  args = parse_args()
+  source_filepath = args.source_filepath
+  target_filepath = args.outfile
+  # Compile once instead of on every line of the corpus.
+  emoji_pattern = regex.compile(r'\p{Emoji_Presentation=Yes}+')
+
+  # Pin UTF-8 explicitly: the output contains '▁', which a locale-dependent
+  # default encoding may be unable to encode. The input is presumably UTF-8
+  # as well -- confirm against the downloaded corpus files.
+  with open(target_filepath, 'w', encoding='utf-8') as outfile:
+    with open(source_filepath, encoding='utf-8') as infile:
+      for line in infile:
+        line = line.strip()
+        line = re.sub(r'https?://[^ ]+', '', line)  # Remove URLs
+        line = re.sub(r'#[^ ]+', '', line)  # Remove hashtags
+        line = emoji_pattern.sub('', line)  # Remove emojis
+        line = re.sub(r'\|+', '|', line)  # Collapse consecutive separators
+        line = re.sub(r'(\|\s)*\|$', '', line)  # Drop trailing separators
+        outfile.write(line.replace('|', '▁') + '\n')  # Replace the separators.
+  print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath))
+
+
+# Run the conversion only when this file is executed as a script.
+if __name__ == '__main__':
+  main()
diff --git a/setup.cfg b/setup.cfg
@@ -28,8 +28,10 @@ dev =
     isort
     mypy==1.7.1
     pytest
+    regex
     toml
     twine
+    types-regex
     types-setuptools
     yapf