diff --git a/rag/app/book.py b/rag/app/book.py index 294f0d7cb27..a76513e2b60 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections, tbls = [], [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") doc_parser = DocxParser() # TODO: table of contents need to be removed diff --git a/rag/app/laws.py b/rag/app/laws.py index 947e4dc408a..9b77b4fb704 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") for txt in Docx()(filename, binary): sections.append(txt) diff --git a/rag/app/naive.py b/rag/app/naive.py index 62c1df023f1..0fcbd9fad72 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = [] pdf_parser = None sections = [] - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) res = tokenize_table(tbls, doc, eng) diff --git a/rag/app/one.py b/rag/app/one.py index e78c7e97eea..c56f121403e 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, eng = lang.lower() == "english" # is_english(cks) - if re.search(r"\.docx?$", filename, re.IGNORECASE): + if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = [txt for txt in laws.Docx()(filename, binary) if txt] callback(0.8, "Finish parsing.")