Skip to content

Commit

Permalink
Make 'One' applicable to tables in docx
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinHuSh committed Nov 25, 2024
1 parent 6314d3c commit 838d093
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions rag/app/one.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import re

from deepdoc.parser.utils import get_text
from rag.app import laws
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser

Expand Down Expand Up @@ -67,7 +67,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = [txt for txt in laws.Docx()(filename, binary) if txt]
sections, tbls = naive.Docx()(filename, binary)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
callback(0.8, "Finish parsing.")

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
Expand Down

0 comments on commit 838d093

Please sign in to comment.