From fb89a7e55f3ac54328df6a3aed7101152299f1f3 Mon Sep 17 00:00:00 2001 From: chrysanthemum-boy <69421435+chrysanthemum-boy@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:31:43 +0800 Subject: [PATCH] Add `.doc` file parser. (#497) ### What problem does this PR solve? Add `.doc` file parser, using tika. ``` pip install tika ``` ``` from tika import parser from io import BytesIO def extract_text_from_doc_bytes(doc_bytes): file_like_object = BytesIO(doc_bytes) parsed = parser.from_buffer(file_like_object) return parsed["content"] ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chrysanthemum-boy --- api/utils/file_utils.py | 2 +- rag/app/book.py | 13 ++++++++++++- rag/app/laws.py | 12 +++++++++++- rag/app/naive.py | 11 ++++++++++- rag/app/one.py | 12 +++++++++++- requirements.txt | 3 ++- 6 files changed, 47 insertions(+), 6 deletions(-) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index e5979b6114..a5af89ca9c 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -147,7 +147,7 @@ def filename_type(filename): return FileType.PDF.value if re.match( - r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): + r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): return FileType.DOC.value if re.match( diff --git a/rag/app/book.py b/rag/app/book.py index 3c46b68a4a..613c1f2cae 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -11,6 +11,7 @@ # limitations under the License. # import copy +from tika import parser import re from io import BytesIO @@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, random_choices([t for t, _ in sections], k=200))) callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [(l, "") for l in sections if l] + remove_contents_table(sections, eng=is_english( + random_choices([t for t, _ in sections], k=200))) + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") make_colon_as_title(sections) bull = bullets_category( diff --git a/rag/app/laws.py b/rag/app/laws.py index acb96692ff..7242b893ce 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -11,6 +11,7 @@ # limitations under the License. # import copy +from tika import parser import re from io import BytesIO from docx import Document @@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = txt.split("\n") sections = [l for l in sections if l] callback(0.8, "Finish parsing.") + + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [l for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") # is it English eng = lang.lower() == "english" # is_english(sections) diff --git a/rag/app/naive.py b/rag/app/naive.py index 82618b1560..cd77cdc485 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from tika import parser from io import BytesIO from docx import Document import re @@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [(l, "") for l in sections if l] callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [(l, "") for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") chunks = naive_merge( sections, parser_config.get( diff --git a/rag/app/one.py b/rag/app/one.py index 430958d25e..dfcc44f576 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -10,6 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from tika import parser +from io import BytesIO import re from rag.app import laws from rag.nlp import huqie, tokenize, find_codec @@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [s for s in sections if s] callback(0.8, "Finish parsing.") + elif re.search(r"\.doc$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + binary = BytesIO(binary) + doc_parsed = parser.from_buffer(binary) + sections = doc_parsed['content'].split('\n') + sections = [l for l in sections if l] + callback(0.8, "Finish parsing.") + else: raise NotImplementedError( - "file type not supported yet(docx, pdf, txt supported)") + "file type not supported yet(doc, docx, pdf, txt supported)") doc = { "docnm_kwd": filename, diff --git a/requirements.txt b/requirements.txt index f9ca516fe6..419e2f6c55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -116,6 +116,7 @@ sniffio==1.3.1 StrEnum==0.4.15 sympy==1.12 threadpoolctl==3.3.0 +tika==2.6.0 tiktoken==0.6.0 tokenizers==0.15.2 torch==2.2.1 @@ -133,4 +134,4 @@ xxhash==3.4.1 yarl==1.9.4 zhipuai==2.0.1 BCEmbedding -loguru==0.7.2 +loguru==0.7.2 \ No newline at end of file