From 838d09303d4d57f405f2e6997babe1f550bacb02 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 25 Nov 2024 09:38:41 +0800 Subject: [PATCH] Make 'One' applicable to tables in docx --- rag/app/one.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rag/app/one.py b/rag/app/one.py index 9f24ccb95b..76dc45893d 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -16,7 +16,7 @@ import re from deepdoc.parser.utils import get_text -from rag.app import laws +from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser @@ -67,7 +67,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections = [txt for txt in laws.Docx()(filename, binary) if txt] + sections, tbls = naive.Docx()(filename, binary) + sections = [s for s, _ in sections if s] + for (_, html), _ in tbls: + sections.append(html) callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE):