From d2cfa0d833528eca6f106340d00f81d820cef6e1 Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Mon, 22 Apr 2024 15:46:09 +0800 Subject: [PATCH] remove doc from supported processing types (#488) ### What problem does this PR solve? #474 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/book.py | 2 +- rag/app/laws.py | 2 +- rag/app/naive.py | 2 +- rag/app/one.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rag/app/book.py b/rag/app/book.py index 294f0d7cb2..a76513e2b6 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections, tbls = [], [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") doc_parser = DocxParser() # TODO: table of contents need to be removed diff --git a/rag/app/laws.py b/rag/app/laws.py index 947e4dc408..9b77b4fb70 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") for txt in Docx()(filename, binary): sections.append(txt) diff --git a/rag/app/naive.py b/rag/app/naive.py index 62c1df023f..0fcbd9fad7 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = [] pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) res = tokenize_table(tbls, doc, eng) diff --git a/rag/app/one.py b/rag/app/one.py index e78c7e97ee..c56f121403 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, eng = lang.lower() == "english" # is_english(cks) - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = [txt for txt in laws.Docx()(filename, binary) if txt] callback(0.8, "Finish parsing.")