diff --git a/source/notebooks/pythainlp_chunk.ipynb b/source/notebooks/pythainlp_chunk.ipynb
index 9ea47f9..931095b 100644
--- a/source/notebooks/pythainlp_chunk.ipynb
+++ b/source/notebooks/pythainlp_chunk.ipynb
@@ -1,275 +1,328 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "eCfShB9fUSqO"
+ },
+ "source": [
+ "# Thai Chunk Parser\n",
+ "\n",
+ "This tutorial demonstrates how to use the `chunk_parse` function from the PyThaiNLP library for parsing Thai text into phrases. We will use a chunking model trained on the ORCHID++ corpus.\n",
+ "\n",
+ "Read more: https://github.com/PyThaiNLP/pythainlp/pull/524"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will need the following libraries and packages: \n",
+ "- PyThaiNLP\n",
+ "- NLTK (to preprocess chunk data for visualization)\n",
+ "- svgling (for visualization)\n",
+ "- python-crfsuite"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
"colab": {
- "provenance": []
+ "base_uri": "https://localhost:8080/"
},
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
+ "id": "JvwrS6MDhitW",
+ "outputId": "ab197d92-b537-4974-e1b5-6bdaa7b8cefd"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: pythainlp in /usr/local/lib/python3.10/dist-packages (4.0.2)\n",
+ "Requirement already satisfied: svgling in /usr/local/lib/python3.10/dist-packages (0.3.1)\n",
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
+ "Collecting python-crfsuite\n",
+ " Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.10/dist-packages (from pythainlp) (2.31.0)\n",
+ "Requirement already satisfied: svgwrite in /usr/local/lib/python3.10/dist-packages (from svgling) (1.4.3)\n",
+ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.6)\n",
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)\n",
+ "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.2.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.4)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2.0.4)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2023.7.22)\n",
+ "Installing collected packages: python-crfsuite\n",
+ "Successfully installed python-crfsuite-0.9.9\n"
+ ]
}
+ ],
+ "source": [
+ "!pip install pythainlp svgling nltk python-crfsuite"
+ ]
},
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "eCfShB9fUSqO"
- },
- "source": [
- "# Thai Chunk Parser\n",
- "\n",
- "In PyThaiNLP, We use chunk data from ORCHID++ corpus.\n",
- "\n",
- "Read more: https://github.com/PyThaiNLP/pythainlp/pull/524"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "JvwrS6MDhitW",
- "outputId": "ab197d92-b537-4974-e1b5-6bdaa7b8cefd"
- },
- "source": [
- "!pip install pythainlp svgling nltk python-crfsuite"
- ],
- "execution_count": 5,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Requirement already satisfied: pythainlp in /usr/local/lib/python3.10/dist-packages (4.0.2)\n",
- "Requirement already satisfied: svgling in /usr/local/lib/python3.10/dist-packages (0.3.1)\n",
- "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
- "Collecting python-crfsuite\n",
- " Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.10/dist-packages (from pythainlp) (2.31.0)\n",
- "Requirement already satisfied: svgwrite in /usr/local/lib/python3.10/dist-packages (from svgling) (1.4.3)\n",
- "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.6)\n",
- "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)\n",
- "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)\n",
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.0)\n",
- "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.2.0)\n",
- "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.4)\n",
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2.0.4)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2023.7.22)\n",
- "Installing collected packages: python-crfsuite\n",
- "Successfully installed python-crfsuite-0.9.9\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "ZPRRhKxrhlFA"
- },
- "source": [
- "from pythainlp.tokenize import word_tokenize\n",
- "from pythainlp.tag import pos_tag\n",
- "from pythainlp.tag import chunk_parse\n",
- "from nltk.chunk import conlltags2tree\n",
- "import svgling"
- ],
- "execution_count": 1,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "bGD2uxMFhmh4"
- },
- "source": [
- "def test(txt):\n",
- " m = [(w,t) for w,t in pos_tag(word_tokenize(txt), engine= 'perceptron',corpus = 'orchid')]\n",
- " tag = chunk_parse(m)\n",
- " p = [(w,t,tag[i]) for i,(w,t) in enumerate(m)]\n",
- " return p"
- ],
- "execution_count": 2,
- "outputs": []
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to import the following modules and functions:\n",
+ "- `word_tokenize` – this function takes a Thai text and returns a list of tokenized words\n",
+ "- `pos_tag` – this function takes a list of tokenized words and marks them with part-of-speech (POS) tags\n",
+ "- `chunk_parse` – this function takes words with their POS tags and marks them with inside-outside-beginning (IOB) tags\n",
+ "- `conlltags2tree` – this function is part of NLTK and converts a list of (word, POS tag, IOB tag) triples into a tree\n",
+ "- `svgling` – this package will be used to visualize the tree in SVG\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "ZPRRhKxrhlFA"
+ },
+ "outputs": [],
+ "source": [
+ "from pythainlp.tokenize import word_tokenize\n",
+ "from pythainlp.tag import pos_tag\n",
+ "from pythainlp.tag import chunk_parse\n",
+ "from nltk.chunk import conlltags2tree\n",
+ "import svgling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We define a new function `test`, which first segments the input text into words (`word_tokenize`), tags the words with their parts of speech using the perceptron tagger with the `orchid` corpus (`pos_tag`), and then performs chunking (`chunk_parse`). The function then combines the words, POS tags, and IOB tags into a list of triples `p` and returns it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "bGD2uxMFhmh4"
+ },
+ "outputs": [],
+ "source": [
+ "def test(txt):\n",
+ " m = [(w,t) for w,t in pos_tag(word_tokenize(txt), engine= 'perceptron',corpus = 'orchid')]\n",
+ " tag = chunk_parse(m)\n",
+ " p = [(w,t,tag[i]) for i,(w,t) in enumerate(m)]\n",
+ " return p"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we call the `test` function to chunk several example sentences. For each sentence, the `conlltags2tree` function converts the chunked data into a syntactic tree, and the `svgling.draw_tree` function visualizes that tree."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "ag8oszXfhoAZ",
+ "outputId": "b789de88-d812-44ca-d0d9-4f031127b68d"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "id": "ag8oszXfhoAZ",
- "outputId": "b789de88-d812-44ca-d0d9-4f031127b68d"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"แมวกินปลา\")))"
+ "data": {
+ "image/svg+xml": [
+ ""
],
- "execution_count": 3,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('แมว', 'NCMN')]), Tree('VP', [('กิน', 'VACT'), ('ปลา', 'NCMN')])]))"
- ],
- "image/svg+xml": "SNPแมวNCMNVPกินVACTปลาNCMN"
- },
- "metadata": {},
- "execution_count": 3
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('แมว', 'NCMN')]), Tree('VP', [('กิน', 'VACT'), ('ปลา', 'NCMN')])]))"
]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"แมวกินปลา\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "L3COVriThp3B",
+ "outputId": "27256b8d-f265-49cb-c5f1-85fee90b79e4"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "id": "L3COVriThp3B",
- "outputId": "27256b8d-f265-49cb-c5f1-85fee90b79e4"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"คนหนองคายเป็นคนน่ารัก\")))"
+ "data": {
+ "image/svg+xml": [
+ "SNPคนNCMNหนองคายNCMNVPเป็นVSTAคนNCMNน่ารักVATT"
],
- "execution_count": 4,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('หนองคาย', 'NCMN')]), Tree('VP', [('เป็น', 'VSTA'), ('คน', 'NCMN'), ('น่ารัก', 'VATT')])]))"
- ],
- "image/svg+xml": "SNPคนNCMNหนองคายNCMNVPเป็นVSTAคนNCMNน่ารักVATT"
- },
- "metadata": {},
- "execution_count": 4
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('หนองคาย', 'NCMN')]), Tree('VP', [('เป็น', 'VSTA'), ('คน', 'NCMN'), ('น่ารัก', 'VATT')])]))"
]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"คนหนองคายเป็นคนน่ารัก\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "YwaQNhLPib6Y",
+ "outputId": "1ebc2402-90bf-4a37-8b3e-60b62bb52bae"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "YwaQNhLPib6Y",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "outputId": "1ebc2402-90bf-4a37-8b3e-60b62bb52bae"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"ปลาอะไรอยู่ในน้ำ\")))"
+ "data": {
+ "image/svg+xml": [
+ "SNPปลาNCMNอะไรPNTRVPอยู่VSTAในRPREน้ำNCMN"
],
- "execution_count": 5,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('ปลา', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('ใน', 'RPRE'), ('น้ำ', 'NCMN')])]))"
- ],
- "image/svg+xml": "SNPปลาNCMNอะไรPNTRVPอยู่VSTAในRPREน้ำNCMN"
- },
- "metadata": {},
- "execution_count": 5
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('ปลา', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('ใน', 'RPRE'), ('น้ำ', 'NCMN')])]))"
]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"ปลาอะไรอยู่ในน้ำ\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "PB7AU2febneD",
+ "outputId": "32bfea36-c0e1-484a-dbb6-b77536124507"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "id": "PB7AU2febneD",
- "outputId": "32bfea36-c0e1-484a-dbb6-b77536124507"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"ในน้ำมีอะไรอยู่\")))"
+ "data": {
+ "image/svg+xml": [
+ "SNPในRPREน้ำNCMNVPมีVSTAอะไรPNTRอยู่XVAE"
],
- "execution_count": 6,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('ใน', 'RPRE'), ('น้ำ', 'NCMN')]), Tree('VP', [('มี', 'VSTA'), ('อะไร', 'PNTR'), ('อยู่', 'XVAE')])]))"
- ],
- "image/svg+xml": "SNPในRPREน้ำNCMNVPมีVSTAอะไรPNTRอยู่XVAE"
- },
- "metadata": {},
- "execution_count": 6
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('ใน', 'RPRE'), ('น้ำ', 'NCMN')]), Tree('VP', [('มี', 'VSTA'), ('อะไร', 'PNTR'), ('อยู่', 'XVAE')])]))"
]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"ในน้ำมีอะไรอยู่\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "uu4KZ4OIbqy5",
+ "outputId": "c49b5cd2-680f-4a44-afe7-8c80368bffa8"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "id": "uu4KZ4OIbqy5",
- "outputId": "c49b5cd2-680f-4a44-afe7-8c80368bffa8"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"ทำไมเขารักคุณ\")))"
+ "data": {
+ "image/svg+xml": [
+ "SNPทำไมNCMNเขาPPRSVPรักVACTคุณPPRS"
],
- "execution_count": 7,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('ทำไม', 'NCMN'), ('เขา', 'PPRS')]), Tree('VP', [('รัก', 'VACT'), ('คุณ', 'PPRS')])]))"
- ],
- "image/svg+xml": "SNPทำไมNCMNเขาPPRSVPรักVACTคุณPPRS"
- },
- "metadata": {},
- "execution_count": 7
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('ทำไม', 'NCMN'), ('เขา', 'PPRS')]), Tree('VP', [('รัก', 'VACT'), ('คุณ', 'PPRS')])]))"
]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"ทำไมเขารักคุณ\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 188
},
+ "id": "xAsZ9PkvbxrG",
+ "outputId": "1d8c7932-ecf1-4671-a9f7-b2263e3dd80a"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "xAsZ9PkvbxrG",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "outputId": "1d8c7932-ecf1-4671-a9f7-b2263e3dd80a"
- },
- "source": [
- "svgling.draw_tree(conlltags2tree(test(\"คนอะไรอยู่หลังต้นไม้\")))"
+ "data": {
+ "image/svg+xml": [
+ "SNPคนNCMNอะไรPNTRVPอยู่VSTAหลังRPREต้นไม้NCMN"
],
- "execution_count": 8,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('หลัง', 'RPRE'), ('ต้นไม้', 'NCMN')])]))"
- ],
- "image/svg+xml": "SNPคนNCMNอะไรPNTRVPอยู่VSTAหลังRPREต้นไม้NCMN"
- },
- "metadata": {},
- "execution_count": 8
- }
+ "text/plain": [
+ "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('หลัง', 'RPRE'), ('ต้นไม้', 'NCMN')])]))"
]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "SP3ZlCeQJWpq"
- },
- "source": [],
- "execution_count": 8,
- "outputs": []
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ]
-}
\ No newline at end of file
+ ],
+ "source": [
+ "svgling.draw_tree(conlltags2tree(test(\"คนอะไรอยู่หลังต้นไม้\")))"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}