From 5128ef20d9daceff49b4716948366606e3e8c6ad Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Mon, 30 Dec 2024 17:46:26 +0800
Subject: [PATCH 1/2] Code refactor.

---
 agent/component/answer.py | 9 +++++++++
 api/apps/canvas_app.py    | 4 ++++
 api/apps/dialog_app.py    | 6 ++----
 api/apps/kb_app.py        | 3 ++-
 api/utils/api_utils.py    | 4 ++++
 graphrag/graph_prompt.py  | 8 ++++----
 graphrag/utils.py         | 2 +-
 rag/app/laws.py           | 8 +++-----
 rag/app/manual.py         | 4 ++--
 rag/app/table.py          | 2 +-
 10 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/agent/component/answer.py b/agent/component/answer.py
index aedc0dd9793..590e4f20a0c 100644
--- a/agent/component/answer.py
+++ b/agent/component/answer.py
@@ -16,6 +16,7 @@
 import random
 from abc import ABC
 from functools import partial
+from typing import Tuple, Union
 
 import pandas as pd
 
@@ -76,4 +77,12 @@ def stream_output(self):
     def set_exception(self, e):
         self.exception = e
 
+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial: return super.output()
+
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+
+        return self._param.output_var_name, pd.DataFrame([])  # no "user" turn in history
 
diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py
index bd4ae2b77d0..76426f4173d 100644
--- a/api/apps/canvas_app.py
+++ b/api/apps/canvas_app.py
@@ -146,12 +146,16 @@ def sse():
 
                 canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
                 canvas.history.append(("assistant", final_ans["content"]))
+                if not canvas.path[-1]:
+                    canvas.path.pop(-1)
                 if final_ans.get("reference"):
                     canvas.reference.append(final_ans["reference"])
                 cvs.dsl = json.loads(str(canvas))
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
             except Exception as e:
+                if canvas.path and not canvas.path[-1]:  # trim before the DSL snapshot, as the success path does
+                    canvas.path.pop(-1)
                 cvs.dsl = json.loads(str(canvas))
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
                 traceback.print_exc()
                 yield "data:" + json.dumps({"code": 500, "message": str(e),
diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py
index 0599c4da271..822bcd848f3 100644
--- a/api/apps/dialog_app.py
+++ b/api/apps/dialog_app.py
@@ -103,10 +103,7 @@ def set_dialog():
             }
             if not DialogService.save(**dia):
                 return get_data_error_result(message="Fail to new a dialog!")
-            e, dia = DialogService.get_by_id(dia["id"])
-            if not e:
-                return get_data_error_result(message="Fail to new a dialog!")
-            return get_json_result(data=dia.to_json())
+            return get_json_result(data=dia)
         else:
             del req["dialog_id"]
             if "kb_names" in req:
@@ -117,6 +114,7 @@ def set_dialog():
             if not e:
                 return get_data_error_result(message="Fail to update a dialog!")
             dia = dia.to_dict()
+            dia.update(req)
             dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
             return get_json_result(data=dia)
     except Exception as e:
diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index 14caf7c2291..b120d3cfbb4 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -185,7 +185,8 @@ def rm():
                 return get_data_error_result(
                     message="Database error (Document removal)!")
             f2d = File2DocumentService.get_by_document_id(doc.id)
-            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            if f2d:
+                FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
             File2DocumentService.delete_by_document_id(doc.id)
         FileService.filter_delete(
             [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py
index f4e9e176782..2dba15bbe53 100644
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@@ -120,6 +120,10 @@ def server_error_response(e):
     if len(e.args) > 1:
         return get_json_result(
             code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+    if "index_not_found_exception" in repr(e):  # search index missing: nothing has been parsed yet
+        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                               message="No chunk found, please upload file and parse it.")
+
     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
 
 
diff --git a/graphrag/graph_prompt.py b/graphrag/graph_prompt.py
index 53ce61ad8dc..1d8b39dbaa3 100644
--- a/graphrag/graph_prompt.py
+++ b/graphrag/graph_prompt.py
@@ -11,20 +11,20 @@
 
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
+- entity_name: Name of the entity, capitalized, in language of 'Text'
 - entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
+- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
 
 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
 - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
  Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
 
-3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
 4. When finished, output {completion_delimiter}
 
diff --git a/graphrag/utils.py b/graphrag/utils.py
index c462d0430c7..37de1154351 100644
--- a/graphrag/utils.py
+++ b/graphrag/utils.py
@@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
     return bin
 
 
-def set_llm_cache(llmnm, txt, v: str, history, genconf):
+def set_llm_cache(llmnm, txt, v, history, genconf):
     hasher = xxhash.xxh64()
     hasher.update(str(llmnm).encode("utf-8"))
     hasher.update(str(txt).encode("utf-8"))
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 1e3dedfdc1b..7a32c827319 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in Docx()(filename, binary):
-            sections.append(txt)
-        callback(0.8, "Finish parsing.")
-        chunks = sections
-        return tokenize_chunks(chunks, doc, eng, pdf_parser)
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 49acac15811..c60df258eca 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
-        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
             max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
             most_level = max(0, max_lvl - 1)
             levels = []
@@ -256,7 +256,7 @@ def tag(pn, left, right, top, bottom):
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         return res
 
-    if re.search(r"\.docx$", filename, re.IGNORECASE):
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
         docx_parser = Docx()
         ti_list, tbls = docx_parser(filename, binary,
                                     from_page=0, to_page=10000, callback=callback)
diff --git a/rag/app/table.py b/rag/app/table.py
index e28b882da98..88fceef787d 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
         "datetime": "_dt",
         "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "index", "idx"]:
+        for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values

From 81bf3415546a34c0330713536ed8c76efb1c927f Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Mon, 30 Dec 2024 17:49:24 +0800
Subject: [PATCH 2/2] Code format.

---
 agent/component/answer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/agent/component/answer.py b/agent/component/answer.py
index 590e4f20a0c..67dcbc63f7c 100644
--- a/agent/component/answer.py
+++ b/agent/component/answer.py
@@ -78,7 +78,8 @@ def set_exception(self, e):
         self.exception = e
 
     def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
-        if allow_partial: return super.output()
+        if allow_partial:
+            return super().output()  # call super() to get the bound proxy; bare `super` is the type
 
         for r, c in self._canvas.history[::-1]:
             if r == "user":