diff --git a/rag/app/one.py b/rag/app/one.py index 9f24ccb95bb..76dc45893d2 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -16,7 +16,7 @@ import re from deepdoc.parser.utils import get_text -from rag.app import laws +from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser @@ -67,7 +67,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections = [txt for txt in laws.Docx()(filename, binary) if txt] + sections, tbls = naive.Docx()(filename, binary) + sections = [s for s, _ in sections if s] + for (_, html), _ in tbls: + sections.append(html) callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE):