From 789eeefe079b1ac591e26f1010cdd4b3f603674e Mon Sep 17 00:00:00 2001 From: yaoqiankun <410728991@qq.com> Date: Sat, 28 Sep 2024 11:55:57 +0800 Subject: [PATCH 1/2] Add get_txt function to reduce duplicate code --- deepdoc/parser/txt_parser.py | 18 ++++-------------- deepdoc/parser/utils.py | 16 ++++++++++++++++ rag/app/book.py | 13 ++----------- rag/app/laws.py | 13 ++----------- rag/app/naive.py | 14 ++++++-------- rag/app/one.py | 14 +++----------- rag/app/qa.py | 26 ++++---------------------- rag/app/table.py | 13 ++----------- 8 files changed, 39 insertions(+), 88 deletions(-) create mode 100644 deepdoc/parser/utils.py diff --git a/deepdoc/parser/txt_parser.py b/deepdoc/parser/txt_parser.py index 9d723fb99f7..2d5dd9e8cf5 100644 --- a/deepdoc/parser/txt_parser.py +++ b/deepdoc/parser/txt_parser.py @@ -10,28 +10,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from deepdoc.parser.utils import get_txt +from rag.nlp import num_tokens_from_string -from rag.nlp import find_codec,num_tokens_from_string -import re class RAGFlowTxtParser: def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"): - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(fnm, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(fnm, binary) return self.parser_txt(txt, chunk_token_num, delimiter) @classmethod def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"): - if type(txt) != str: + if not isinstance(txt, str): raise TypeError("txt type should be str!") cks = [""] tk_nums = [0] diff --git a/deepdoc/parser/utils.py b/deepdoc/parser/utils.py new file mode 100644 index 00000000000..2aa46d91a8c --- /dev/null +++ b/deepdoc/parser/utils.py @@ -0,0 +1,16 @@ +from rag.nlp import find_codec + + +def get_txt(fnm: str, binary=None) -> str: + txt = "" + if binary: + encoding = 
find_codec(binary) + txt = binary.decode(encoding, errors="ignore") + else: + with open(fnm, "r") as f: + while True: + line = f.readline() + if not line: + break + txt += line + return txt diff --git a/rag/app/book.py b/rag/app/book.py index e165070b488..da37b115920 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -15,6 +15,7 @@ import re from io import BytesIO +from deepdoc.parser.utils import get_txt from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ tokenize_chunks, find_codec @@ -88,17 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(filename, binary) sections = txt.split("\n") sections = [(l, "") for l in sections if l] remove_contents_table(sections, eng=is_english( diff --git a/rag/app/laws.py b/rag/app/laws.py index 9d6d5c73c29..90a2cddf433 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -17,6 +17,7 @@ from docx import Document from api.db import ParserType +from deepdoc.parser.utils import get_txt from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level from rag.nlp import rag_tokenizer @@ -165,17 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - 
txt += l + txt = get_txt(filename, binary) sections = txt.split("\n") sections = [l for l in sections if l] callback(0.8, "Finish parsing.") diff --git a/rag/app/naive.py b/rag/app/naive.py index 101adca4357..c5a7f0ba846 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -169,7 +169,6 @@ def __call__(self, filename, binary=None): return sections, tbls - def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ @@ -190,7 +189,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] pdf_parser = None - sections = [] if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) @@ -222,13 +220,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.1, "Start to parse.") excel_parser = ExcelParser() if parser_config.get("html4excel"): - sections = [(l, "") for l in excel_parser.html(binary, 12) if l] + sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] else: - sections = [(l, "") for l in excel_parser(binary) if l] + sections = [(_, "") for _ in excel_parser(binary) if _] elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections = TxtParser()(filename,binary, + sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) callback(0.8, "Finish parsing.") @@ -242,13 +240,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = HtmlParser()(filename, binary) - sections = [(l, "") for l in sections if l] + sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") elif re.search(r"\.json$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") 
sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary) - sections = [(l, "") for l in sections if l] + sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") elif re.search(r"\.doc$", filename, re.IGNORECASE): @@ -256,7 +254,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary = BytesIO(binary) doc_parsed = parser.from_buffer(binary) sections = doc_parsed['content'].split('\n') - sections = [(l, "") for l in sections if l] + sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") else: diff --git a/rag/app/one.py b/rag/app/one.py index fe648c71b04..65166381b81 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -13,6 +13,8 @@ from tika import parser from io import BytesIO import re + +from deepdoc.parser.utils import get_txt from rag.app import laws from rag.nlp import rag_tokenizer, tokenize, find_codec from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser @@ -82,17 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(filename, binary) sections = txt.split("\n") sections = [s for s in sections if s] callback(0.8, "Finish parsing.") diff --git a/rag/app/qa.py b/rag/app/qa.py index 38c6392ae9c..4b8027f3d64 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -16,6 +16,8 @@ from timeit import default_timer as timer from nltk import word_tokenize from openpyxl import load_workbook + +from deepdoc.parser.utils import get_txt from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level from rag.nlp import rag_tokenizer, tokenize_table, concat_img from rag.settings 
import cron_logger @@ -305,17 +307,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): return res elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(filename, binary) lines = txt.split("\n") comma, tab = 0, 0 for l in lines: @@ -358,17 +350,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): return res elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(filename, binary) lines = txt.split("\n") last_question, last_answer = "", "" question_stack, level_stack = [], [] diff --git a/rag/app/table.py b/rag/app/table.py index 2195b391a1d..e55a775e6d8 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -20,6 +20,7 @@ from dateutil.parser import parse as datetime_parse from api.db.services.knowledgebase_service import KnowledgebaseService +from deepdoc.parser.utils import get_txt from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec from deepdoc.parser import ExcelParser @@ -146,17 +147,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, callback=callback) elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l + txt = get_txt(filename, binary) lines = txt.split("\n") fails = [] headers = 
lines[0].split(kwargs.get("delimiter", "\t")) From af6df0e1caf110763f3f8b4dc8817666db21a898 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Sun, 29 Sep 2024 10:23:52 +0800 Subject: [PATCH 2/2] Update deepdoc/parser/utils.py --- deepdoc/parser/utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/deepdoc/parser/utils.py b/deepdoc/parser/utils.py index 2aa46d91a8c..79ee6bb6855 100644 --- a/deepdoc/parser/utils.py +++ b/deepdoc/parser/utils.py @@ -1,3 +1,16 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + from rag.nlp import find_codec