From 5128ef20d9daceff49b4716948366606e3e8c6ad Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 30 Dec 2024 17:46:26 +0800 Subject: [PATCH 1/2] Code refactor. --- agent/component/answer.py | 9 +++++++++ api/apps/canvas_app.py | 4 ++++ api/apps/dialog_app.py | 6 ++---- api/apps/kb_app.py | 3 ++- api/utils/api_utils.py | 4 ++++ graphrag/graph_prompt.py | 8 ++++---- graphrag/utils.py | 2 +- rag/app/laws.py | 8 +++----- rag/app/manual.py | 4 ++-- rag/app/table.py | 2 +- 10 files changed, 32 insertions(+), 18 deletions(-) diff --git a/agent/component/answer.py b/agent/component/answer.py index aedc0dd9793..590e4f20a0c 100644 --- a/agent/component/answer.py +++ b/agent/component/answer.py @@ -16,6 +16,7 @@ import random from abc import ABC from functools import partial +from typing import Tuple, Union import pandas as pd @@ -76,4 +77,12 @@ def stream_output(self): def set_exception(self, e): self.exception = e + def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]: + if allow_partial: return super.output() + + for r, c in self._canvas.history[::-1]: + if r == "user": + return self._param.output_var_name, pd.DataFrame([{"content": c}]) + + self._param.output_var_name, pd.DataFrame([]) diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py index bd4ae2b77d0..76426f4173d 100644 --- a/api/apps/canvas_app.py +++ b/api/apps/canvas_app.py @@ -146,12 +146,16 @@ def sse(): canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id}) canvas.history.append(("assistant", final_ans["content"])) + if not canvas.path[-1]: + canvas.path.pop(-1) if final_ans.get("reference"): canvas.reference.append(final_ans["reference"]) cvs.dsl = json.loads(str(canvas)) UserCanvasService.update_by_id(req["id"], cvs.to_dict()) except Exception as e: cvs.dsl = json.loads(str(canvas)) + if not canvas.path[-1]: + canvas.path.pop(-1) UserCanvasService.update_by_id(req["id"], cvs.to_dict()) traceback.print_exc() yield "data:" + json.dumps({"code": 500, "message": str(e), diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py index 0599c4da271..822bcd848f3 100644 --- a/api/apps/dialog_app.py +++ b/api/apps/dialog_app.py @@ -103,10 +103,7 @@ def set_dialog(): } if not DialogService.save(**dia): return get_data_error_result(message="Fail to new a dialog!") - e, dia = DialogService.get_by_id(dia["id"]) - if not e: - return get_data_error_result(message="Fail to new a dialog!") - return get_json_result(data=dia.to_json()) + return get_json_result(data=dia) else: del req["dialog_id"] if "kb_names" in req: @@ -117,6 +114,7 @@ def set_dialog(): if not e: return get_data_error_result(message="Fail to update a dialog!") dia = dia.to_dict() + dia.update(req) dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"]) return get_json_result(data=dia) except Exception as e: diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index 14caf7c2291..b120d3cfbb4 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -185,7 +185,8 @@ def rm(): return get_data_error_result( message="Database error (Document removal)!") f2d = File2DocumentService.get_by_document_id(doc.id) - FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) + if f2d: + FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) File2DocumentService.delete_by_document_id(doc.id) FileService.filter_delete( [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name]) diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index f4e9e176782..2dba15bbe53 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -120,6 +120,10 @@ def server_error_response(e): if len(e.args) > 1: return get_json_result( code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1]) + if repr(e).find("index_not_found_exception") >= 0: + return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, + message="No chunk found, please upload file and parse it.") + return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e)) diff --git a/graphrag/graph_prompt.py b/graphrag/graph_prompt.py index 53ce61ad8dc..1d8b39dbaa3 100644 --- a/graphrag/graph_prompt.py +++ b/graphrag/graph_prompt.py @@ -11,20 +11,20 @@ -Steps- 1. Identify all entities. For each identified entity, extract the following information: -- entity_name: Name of the entity, capitalized +- entity_name: Name of the entity, capitalized, in language of 'Text' - entity_type: One of the following types: [{entity_types}] -- entity_description: Comprehensive description of the entity's attributes and activities +- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text' Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter} 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. For each pair of related entities, extract the following information: - source_entity: name of the source entity, as identified in step 1 - target_entity: name of the target entity, as identified in step 1 -- relationship_description: explanation as to why you think the source entity and the target entity are related to each other +- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text' - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) -3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. +3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. 4. When finished, output {completion_delimiter} diff --git a/graphrag/utils.py b/graphrag/utils.py index c462d0430c7..37de1154351 100644 --- a/graphrag/utils.py +++ b/graphrag/utils.py @@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf): return bin -def set_llm_cache(llmnm, txt, v: str, history, genconf): +def set_llm_cache(llmnm, txt, v, history, genconf): hasher = xxhash.xxh64() hasher.update(str(llmnm).encode("utf-8")) hasher.update(str(txt).encode("utf-8")) diff --git a/rag/app/laws.py b/rag/app/laws.py index 1e3dedfdc1b..7a32c827319 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - for txt in Docx()(filename, binary): - sections.append(txt) - callback(0.8, "Finish parsing.") - chunks = sections - return tokenize_chunks(chunks, doc, eng, pdf_parser) + chunks = Docx()(filename, binary) + callback(0.7, "Finish parsing.") + return tokenize_chunks(chunks, doc, eng, None) elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() if kwargs.get( diff --git a/rag/app/manual.py b/rag/app/manual.py index 49acac15811..c60df258eca 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [(t, lvl, [[0] * 5]) for t, lvl in sections] # set pivot using the most frequent type of title, # then merge between 2 pivot - if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1: + if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) most_level = max(0, max_lvl - 1) levels = [] @@ -256,7 +256,7 @@ def tag(pn, left, right, top, bottom): res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) return res - if re.search(r"\.docx$", filename, re.IGNORECASE): + elif re.search(r"\.docx$", filename, re.IGNORECASE): docx_parser = Docx() ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback) diff --git a/rag/app/table.py b/rag/app/table.py index e28b882da98..88fceef787d 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, "datetime": "_dt", "bool": "_kwd"} for df in dfs: - for n in ["id", "index", "idx"]: + for n in ["id", "_id", "index", "idx"]: if n in df.columns: del df[n] clmns = df.columns.values From 81bf3415546a34c0330713536ed8c76efb1c927f Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 30 Dec 2024 17:49:24 +0800 Subject: [PATCH 2/2] Code format. --- agent/component/answer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agent/component/answer.py b/agent/component/answer.py index 590e4f20a0c..67dcbc63f7c 100644 --- a/agent/component/answer.py +++ b/agent/component/answer.py @@ -78,7 +78,8 @@ def set_exception(self, e): self.exception = e def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]: - if allow_partial: return super.output() + if allow_partial: + return super.output() for r, c in self._canvas.history[::-1]: if r == "user":