From e219618eac9fa3f3c5f701f9bb24dca6e0181d2a Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Tue, 18 Jun 2024 09:49:30 +0800
Subject: [PATCH] fix too long query exception

---
 deepdoc/parser/docx_parser.py | 21 +++++++++++++--------
 rag/app/qa.py                 |  7 +++----
 rag/nlp/query.py              |  1 +
 rag/nlp/search.py             |  4 ++--
 4 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py
index 8e13e25604..6bce3d622f 100644
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@@ -113,19 +113,24 @@ def blockType(b):
     def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(
             fnm, str) else Document(BytesIO(fnm))
-        pn = 0
-        secs = []
+        pn = 0 # parsed page
+        secs = [] # parsed contents
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                secs.append((p.text, p.style.name))
+
+            runs_within_single_paragraph = [] # save runs within the range of pages
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
-                    pn += 1
-                    continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text) # append run.text first
+
+                # wrap page break checker into a static method
+                if RAGFlowDocxParser.has_page_break(run._element.xml):
                     pn += 1
+            secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
+
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls

diff --git a/rag/app/qa.py b/rag/app/qa.py
index 5b81ec522b..91954000c3 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     return d

+
 def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                         break
                     txt += l
         lines = txt.split("\n")
-        comma, tab = 0, 0
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
         code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 last_answer = f'{last_answer}\n{l}'
             else: # is a question
                 if last_answer:
-                    sum_question = ('\n').join(question_stack)
+                    sum_question = '\n'.join(question_stack)
                     if sum_question:
                         res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
                     last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 question_stack.append(question)
                 level_stack.append(question_level)
         if last_answer:
-            sum_question = ('\n').join(question_stack)
+            sum_question = '\n'.join(question_stack)
             if sum_question:
                 res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
         return res

-
     raise NotImplementedError(
         "Excel, csv(txt), pdf and markdown format files are supported.")

diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 7485f19529..40f1d2bf62 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -110,6 +110,7 @@ def need_fine_grained_tokenize(tk):
                 sm = []

             keywords.append(re.sub(r"[ \\\"']+", "", tk))
+            if len(keywords) >= 12: break
             tk_syns = self.syn.lookup(tk)
             tk = EsQueryer.subSpecialChar(tk)

diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 43483237e4..7720330bef 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -98,7 +98,7 @@ def add_filters(bqry):
         if not qst:
             if not req.get("sort"):
                 s = s.sort(
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
@@ -108,7 +108,7 @@ def add_filters(bqry):
                      "mode": "avg", "numeric_type": "double"}},
                     {"top_int": {"order": "asc", "unmapped_type": "float",
                      "mode": "avg", "numeric_type": "double"}},
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
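The docx hunk above replaces the two inline page-break checks with a call to RAGFlowDocxParser.has_page_break(), whose definition is not included in the hunks shown here. A minimal sketch of what that static method presumably looks like, reconstructed only from the checks the old inline code performed (the lastRenderedPageBreak marker and an explicit w:br element of type "page"), is:

    @staticmethod
    def has_page_break(xml: str) -> bool:
        # Sketch, not the committed implementation: a run's XML is treated as a
        # page break if it carries Word's rendered-break marker or an explicit
        # <w:br w:type="page"/> element, mirroring the checks removed above.
        return ("lastRenderedPageBreak" in xml
                or ("w:br" in xml and 'type="page"' in xml))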