Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix too long query exception #1195

Merged
Merged 1 commit on Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions deepdoc/parser/docx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,19 +113,24 @@ def blockType(b):
def __call__(self, fnm, from_page=0, to_page=100000):
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0
secs = []
pn = 0 # parsed page
secs = [] # parsed contents
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
secs.append((p.text, p.style.name))

runs_within_single_paragraph = [] # save runs within the range of pages
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
runs_within_single_paragraph.append(run.text) # append run.text first

# wrap page break checker into a static method
if RAGFlowDocxParser.has_page_break(run._element.xml):
pn += 1

secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph

tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
return secs, tbls
7 changes: 3 additions & 4 deletions rag/app/qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
return d


def mdQuestionLevel(s):
match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
Expand Down Expand Up @@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
break
txt += l
lines = txt.split("\n")
comma, tab = 0, 0
last_question, last_answer = "", ""
question_stack, level_stack = [], []
code_block = False
Expand All @@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
last_answer = f'{last_answer}\n{l}'
else: # is a question
if last_answer:
sum_question = ('\n').join(question_stack)
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
last_answer = ''
Expand All @@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack.append(question)
level_stack.append(question_level)
if last_answer:
sum_question = ('\n').join(question_stack)
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
return res


raise NotImplementedError(
"Excel, csv(txt), pdf and markdown format files are supported.")

Expand Down
1 change: 1 addition & 0 deletions rag/nlp/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def need_fine_grained_tokenize(tk):
sm = []

keywords.append(re.sub(r"[ \\\"']+", "", tk))
if len(keywords) >= 12: break

tk_syns = self.syn.lookup(tk)
tk = EsQueryer.subSpecialChar(tk)
Expand Down
4 changes: 2 additions & 2 deletions rag/nlp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def add_filters(bqry):
if not qst:
if not req.get("sort"):
s = s.sort(
{"create_time": {"order": "desc", "unmapped_type": "date"}},
#{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
Expand All @@ -108,7 +108,7 @@ def add_filters(bqry):
"mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
#{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
Expand Down