Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove clause_tokenize #1024

Merged
merged 1 commit
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a
Modules
-------

.. autofunction:: clause_tokenize
:noindex:

Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks.

.. autofunction:: sent_tokenize
:noindex:
Expand Down
2 changes: 0 additions & 2 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"THAI2FIT_TOKENIZER",
"Tokenizer",
"Trie",
"clause_tokenize",
"paragraph_tokenize",
"sent_tokenize",
"subword_tokenize",
Expand All @@ -32,7 +31,6 @@

from pythainlp.tokenize.core import (
Tokenizer,
clause_tokenize,
paragraph_tokenize,
sent_tokenize,
subword_tokenize,
Expand Down
37 changes: 0 additions & 37 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,43 +25,6 @@
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Clause tokenizer (clause segmentation).

    Groups an already word-tokenized sentence into clauses, using a CRF
    model trained on the Blackboard Treebank.

    :param str doc: list of words to be grouped into clauses
    :return: list of clauses, each clause being a list of words
    :rtype: list[list[str]]
    :Example:
    ::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(
            [
                "ฉัน",
                "นอน",
                "และ",
                "คุณ",
                "เล่น",
                "มือถือ",
                "ส่วน",
                "น้อง",
                "เขียน",
                "โปรแกรม",
            ]
        )
        # [['ฉัน', 'นอน'],
        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
    """
    # Imported lazily so the CRF model is only loaded when this
    # function is actually used.
    from pythainlp.tokenize import crfcls

    return crfcls.segment(doc)


def word_detokenize(
segments: Union[List[List[str]], List[str]], output: str = "str"
) -> Union[List[str], str]:
Expand Down
77 changes: 0 additions & 77 deletions pythainlp/tokenize/crfcls.py

This file was deleted.

7 changes: 0 additions & 7 deletions tests/extra/testx_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
tltk,
word_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize

from ..core.test_tokenize import (
SENT_1,
Expand All @@ -31,12 +30,6 @@
)


class ClauseTokenizeTestCase(unittest.TestCase):
    def test_clause_tokenize(self):
        # A two-word input should always yield a (non-None) list of clauses.
        clauses = sent_clause_tokenize(["ฉัน", "ทดสอบ"])
        self.assertIsNotNone(clauses)
        self.assertIsInstance(clauses, list)

class DetokenizeTestCase(unittest.TestCase):
def test_numeric_data_format(self):
engines = ["attacut", "deepcut", "sefr_cut"]
Expand Down
Loading