diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index e5979b6114..a5af89ca9c 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -147,7 +147,7 @@ def filename_type(filename): return FileType.PDF.value if re.match( - r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): + r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): return FileType.DOC.value if re.match( diff --git a/rag/app/book.py b/rag/app/book.py index a76513e2b6..75ea8eabf8 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -11,6 +11,7 @@ # limitations under the License. # import copy +from tika import parser import re from io import BytesIO @@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, random_choices([t for t, _ in sections], k=200))) callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [(l, "") for l in sections if l] + remove_contents_table(sections, eng=is_english( + random_choices([t for t, _ in sections], k=200))) + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") make_colon_as_title(sections) bull = bullets_category( diff --git a/rag/app/laws.py b/rag/app/laws.py index 9b77b4fb70..eb2ae3a61b 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -11,6 +11,7 @@ # limitations under the License. # import copy +from tika import parser import re from io import BytesIO from docx import Document @@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = txt.split("\n") sections = [l for l in sections if l] callback(0.8, "Finish parsing.") + + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [l for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") # is it English eng = lang.lower() == "english" # is_english(sections) diff --git a/rag/app/naive.py b/rag/app/naive.py index 0fcbd9fad7..432531f58f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from tika import parser from io import BytesIO from docx import Document import re @@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [(l, "") for l in sections if l] callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [(l, "") for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") chunks = naive_merge( sections, parser_config.get( diff --git a/rag/app/one.py b/rag/app/one.py index c56f121403..fd2bc2c8e8 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -10,6 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from tika import parser +from io import BytesIO import re from rag.app import laws from rag.nlp import huqie, tokenize, find_codec @@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [s for s in sections if s] callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [l for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") doc = { "docnm_kwd": filename, diff --git a/requirements.txt b/requirements.txt index f9ca516fe6..419e2f6c55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -116,6 +116,7 @@ sniffio==1.3.1 StrEnum==0.4.15 sympy==1.12 threadpoolctl==3.3.0 +tika==2.6.0 tiktoken==0.6.0 tokenizers==0.15.2 torch==2.2.1 @@ -133,4 +134,4 @@ xxhash==3.4.1 yarl==1.9.4 zhipuai==2.0.1 BCEmbedding -loguru==0.7.2 +loguru==0.7.2 \ No newline at end of file