Add get_txt function #2639

Merged
merged 2 commits into from Sep 29, 2024
18 changes: 4 additions & 14 deletions deepdoc/parser/txt_parser.py
@@ -10,28 +10,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from deepdoc.parser.utils import get_txt
+from rag.nlp import num_tokens_from_string
-
-from rag.nlp import find_codec,num_tokens_from_string
 import re
 
 class RAGFlowTxtParser:
     def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(fnm, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(fnm, binary)
         return self.parser_txt(txt, chunk_token_num, delimiter)
 
     @classmethod
     def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        if type(txt) != str:
+        if not isinstance(txt, str):
             raise TypeError("txt type should be str!")
         cks = [""]
         tk_nums = [0]
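
For orientation, a minimal usage sketch of the refactored parser. The sample text, file name, and the note on chunk shape are illustrative assumptions, not taken from the PR:

# Hypothetical usage of the refactored RAGFlowTxtParser (illustrative only).
from deepdoc.parser.txt_parser import RAGFlowTxtParser

parser = RAGFlowTxtParser()

# Bytes in: get_txt() detects the codec via find_codec and decodes them.
data = "第一句。第二句!\nA third sentence?".encode("utf-8")
chunks = parser("any-name.txt", binary=data, chunk_token_num=128)

# Path in: get_txt() falls back to reading the file line by line.
# chunks = parser("/path/to/local.txt")

# parser_txt() splits on the configured delimiters and accumulates pieces
# up to roughly chunk_token_num tokens each.
print(len(chunks))
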
29 changes: 29 additions & 0 deletions deepdoc/parser/utils.py
@@ -0,0 +1,29 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rag.nlp import find_codec
+
+
+def get_txt(fnm: str, binary=None) -> str:
+    txt = ""
+    if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding, errors="ignore")
+    else:
+        with open(fnm, "r") as f:
+            while True:
+                line = f.readline()
+                if not line:
+                    break
+                txt += line
+    return txt
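
A quick sketch of how the new helper behaves in both modes; the byte payload and file name below are invented for illustration:

from deepdoc.parser.utils import get_txt

# Binary mode: find_codec() guesses the encoding of the bytes, and
# errors="ignore" silently drops anything that cannot be decoded.
raw = "编码检测示例\nplain ascii line\n".encode("utf-8")
text = get_txt("name-is-ignored.txt", binary=raw)
assert text.startswith("编码检测示例")

# Filename mode: the file is read line by line. Note that open() is called
# without an explicit encoding, so the platform default applies here;
# only the binary path goes through find_codec.
# text = get_txt("/path/to/file.txt")
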
13 changes: 2 additions & 11 deletions rag/app/book.py
@@ -15,6 +15,7 @@
 import re
 from io import BytesIO
 
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
     tokenize_chunks, find_codec
@@ -88,17 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         remove_contents_table(sections, eng=is_english(
13 changes: 2 additions & 11 deletions rag/app/laws.py
@@ -17,6 +17,7 @@
 from docx import Document
 
 from api.db import ParserType
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
@@ -165,17 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
14 changes: 6 additions & 8 deletions rag/app/naive.py
@@ -169,7 +169,6 @@ def __call__(self, filename, binary=None):
         return sections, tbls
 
 
-
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -190,7 +189,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
-    sections = []
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
@@ -222,13 +220,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
-            sections = [(l, "") for l in excel_parser.html(binary, 12) if l]
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
         else:
-            sections = [(l, "") for l in excel_parser(binary) if l]
+            sections = [(_, "") for _ in excel_parser(binary) if _]
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename,binary,
+        sections = TxtParser()(filename, binary,
                                parser_config.get("chunk_token_num", 128),
                                parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")
@@ -242,21 +240,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = HtmlParser()(filename, binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.json$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
         sections = doc_parsed['content'].split('\n')
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     else:
14 changes: 3 additions & 11 deletions rag/app/one.py
@@ -13,6 +13,8 @@
 from tika import parser
 from io import BytesIO
 import re
+
+from deepdoc.parser.utils import get_txt
 from rag.app import laws
 from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
@@ -82,17 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
26 changes: 4 additions & 22 deletions rag/app/qa.py
@@ -16,6 +16,8 @@
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
+
+from deepdoc.parser.utils import get_txt
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
@@ -305,17 +307,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
@@ -358,17 +350,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
13 changes: 2 additions & 11 deletions rag/app/table.py
@@ -20,6 +20,7 @@
 from dateutil.parser import parse as datetime_parse
 
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.utils import get_txt
 from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
 
@@ -146,17 +147,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                             callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         fails = []
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
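
To make the txt/csv branch concrete, here is a guessed walk-through of what follows get_txt. The diff cuts off right after the headers line, so the row handling below is an assumption about the general shape, not the repository's actual logic, and the sample data is invented:

# Invented sample mirroring the lines shown above; the loop is a guess.
txt = "name\tage\nalice\t30\nbob\t41"  # what get_txt() might return

lines = txt.split("\n")
fails = []
headers = lines[0].split("\t")  # kwargs.get("delimiter", "\t") upstream

rows = []
for i, line in enumerate(lines[1:]):
    fields = line.split("\t")
    if len(fields) != len(headers):
        fails.append(str(i + 1))  # assumed: track rows that don't match the header width
        continue
    rows.append(dict(zip(headers, fields)))

print(rows)   # [{'name': 'alice', 'age': '30'}, {'name': 'bob', 'age': '41'}]
print(fails)  # []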