diff --git a/pyproject.toml b/pyproject.toml
index f9b7819..53a54df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fast-langdetect"
-version = "0.1.1"
+version = "0.2.0"
 description = "Quickly detect text language and segment language"
 authors = [
     { name = "sudoskys", email = "coldlando@hotmail.com" },
diff --git a/src/fast_langdetect/__init__.py b/src/fast_langdetect/__init__.py
index 09218e4..d55834a 100644
--- a/src/fast_langdetect/__init__.py
+++ b/src/fast_langdetect/__init__.py
@@ -1,4 +1,3 @@
 # -*- coding: utf-8 -*-
 
 from .ft_detect import detect, detect_langs, detect_multilingual  # noqa: F401
-from .split import parse_sentence  # noqa: F401
diff --git a/src/fast_langdetect/split/__init__.py b/src/fast_langdetect/split/__init__.py
deleted file mode 100644
index 94371ec..0000000
--- a/src/fast_langdetect/split/__init__.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Time : 2024/1/17 4:23 PM
-# @Author : sudoskys
-# @File : __init__.py.py
-from typing import List
-
-from .cut import CutSentence
-from ..ft_detect import detect_langs
-
-CUT = CutSentence()
-
-
-def _merge_cell(result: List[dict]) -> List[dict]:
-    _merged = []
-    _cache = []
-    last_lang = None
-    for _result in result:
-        if _result["lang"] == last_lang:
-            _cache.append(_result["text"])
-        else:
-            if _cache:
-                _length = sum([len(_c) for _c in _cache])
-                _merged.append({"text": "".join(_cache), "lang": last_lang, "length": _length})
-            _cache = [_result["text"]]
-            last_lang = _result["lang"]
-    if _cache:
-        _length = sum([len(_c) for _c in _cache])
-        _merged.append({"text": "".join(_cache), "lang": last_lang, "length": _length})
-    return _merged
-
-
-def parse_sentence(sentence: str,
-                   merge_same: bool = True,
-                   cell_limit: int = 150,
-                   filter_space: bool = True,
-                   low_memory: bool = True
-                   ) -> list:
-    """
-    Parse sentence
-    :param sentence:
-    :param merge_same:
-    :param cell_limit:
-    :param filter_space:
-    :param low_memory:
-    :return:
-    """
-
-    cut_list = CUT.chinese_sentence_cut(sentence)
-    _cut_list = []
-    for _cut in cut_list:
-        if len(_cut) > cell_limit:
-            _text_list = [_cut[i:i + cell_limit] for i in range(0, len(_cut), cell_limit)]
-            _cut_list.extend(_text_list)
-        else:
-            _cut_list.append(_cut)
-    _result = []
-    for _cut in _cut_list:
-        _lang = detect_langs(_cut, low_memory=low_memory)
-        if not filter_space:
-            _result.append({"text": _cut, "lang": _lang, "length": len(_cut)})
-        else:
-            if _lang:
-                _result.append({"text": _cut, "lang": _lang, "length": len(_cut)})
-    if merge_same:
-        _result = _merge_cell(_result)
-    return _result
diff --git a/src/fast_langdetect/split/cut.py b/src/fast_langdetect/split/cut.py
deleted file mode 100644
index e6cea7d..0000000
--- a/src/fast_langdetect/split/cut.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Time : 2024/1/17 8:27 PM
-# @Author : sudoskys
-# @File : cut.py
-# @Software: PyCharm
-import re
-
-
-class CutSentence(object):
-    @staticmethod
-    def english_sentence_cut(text) -> list:
-        list_ = list()
-        for s_str in text.split('.'):
-            if '?' in s_str:
-                list_.extend(s_str.split('?'))
-            elif '!' in s_str:
-                list_.extend(s_str.split('!'))
-            else:
-                list_.append(s_str)
-        return list_
-
-    @staticmethod
-    def chinese_sentence_cut(text) -> list:
-        """
-        Chinese sentence segmentation
-        """
-        text = re.sub('([^\n])([.!?。！？]+)(?!\d|[a-zA-Z]|[^\s\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ff\uac00-\ud7a3])',
-                      r'\1\2\n', text)  # split after sentence-final punctuation
-        text = re.sub('([。：:！？\?])([^’”])', r'\1\n\2', text)  # split on Chinese sentence-ending marks
-        text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # six-dot ellipsis not followed by a closing quote
-        text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # Chinese ellipsis not followed by a closing quote
-        text = re.sub('([.。！？\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # punctuation plus closing quote
-        text = re.sub('(\. )([^a-zA-Z\d])', r'\1\n\2', text)  # English period followed by a space
-        text = re.sub('\n\n+', '\n', text)  # collapse redundant newlines
-        return text.split("\n")  # one sentence per line
-
-    def cut_chinese_sentence(self, text):
-        """
-        Chinese sentence segmentation, preserving quoted spans
-        """
-        p = re.compile("“.*?”")
-        listr = []
-        index = 0
-        for i in p.finditer(text):
-            temp = ''
-            start = i.start()
-            end = i.end()
-            for j in range(index, start):
-                temp += text[j]
-            if temp != '':
-                temp_list = self.chinese_sentence_cut(temp)
-                listr += temp_list
-                temp = ''
-            for k in range(start, end):
-                temp += text[k]
-            if temp != ' ':
-                listr.append(temp)
-            index = end
-        return listr
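
Migration note: 0.2.0 stops exporting parse_sentence and deletes the split package, so `from fast_langdetect import parse_sentence` now breaks; only detect, detect_langs, and detect_multilingual remain. Downstream callers who still need sentence-level language segmentation can reproduce the old behavior on their side. The sketch below is one illustrative way to do that, assuming only the detect_langs(text, low_memory=...) call visible in the removed code above; the single-regex splitter is a deliberately simplified stand-in for the deleted CutSentence cascade, and the name parse_sentence is reused here purely for familiarity, not re-exported by the library.

import re
from typing import List

from fast_langdetect import detect_langs


def parse_sentence(sentence: str,
                   merge_same: bool = True,
                   cell_limit: int = 150,
                   low_memory: bool = True) -> List[dict]:
    # Split after common Chinese/English sentence-final punctuation.
    # This is a simplified stand-in for the regex cascade in the removed cut.py.
    cells = [c.strip() for c in re.split(r'(?<=[。！？.!?])', sentence)]
    # Re-chunk oversized cells, as the old cell_limit parameter did.
    chunks: List[str] = []
    for cell in cells:
        chunks.extend(cell[i:i + cell_limit] for i in range(0, len(cell), cell_limit))
    result = []
    for chunk in chunks:
        lang = detect_langs(chunk, low_memory=low_memory)
        if lang:  # drop undetectable cells, like the old filter_space=True default
            result.append({"text": chunk, "lang": lang, "length": len(chunk)})
    if not merge_same:
        return result
    # Merge adjacent same-language cells, mirroring the removed _merge_cell helper.
    merged: List[dict] = []
    for cell in result:
        if merged and merged[-1]["lang"] == cell["lang"]:
            merged[-1]["text"] += cell["text"]
            merged[-1]["length"] += cell["length"]
        else:
            merged.append(dict(cell))
    return merged

Merging last keeps the return shape of the old API: a list of {"text", "lang", "length"} dicts in input order, with runs of the same detected language collapsed into a single cell.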