From c8b1a564aa0d72ec528c152cdf6b70cf9d14fd80 Mon Sep 17 00:00:00 2001
From: Zhichang Yu
Date: Thu, 12 Dec 2024 17:47:39 +0800
Subject: [PATCH] Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id.

### Type of change

- [x] Refactoring
---
 api/apps/chunk_app.py               |  6 ++----
 api/apps/sdk/doc.py                 |  7 ++-----
 api/db/services/document_service.py |  7 ++-----
 api/db/services/task_service.py     | 20 ++++++++------------
 rag/svr/task_executor.py            | 11 +++--------
 5 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 6dc1f9b646..644353ba60 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -31,7 +31,7 @@
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re
 
 
@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 7fa6458461..29e3d9bc2f 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -22,7 +22,7 @@
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index cff5db583d..e5b7641efc 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -14,7 +14,7 @@
 #  limitations under the License.
 #
 import logging
-import hashlib
+import xxhash
 import json
 import random
 import re
@@ -508,10 +508,7 @@ def dummy(prog=None, msg=""):
         for ck in th.result():
             d = deepcopy(doc)
             d.update(ck)
-            md5 = hashlib.md5()
-            md5.update((ck["content_with_weight"] +
-                        str(d["doc_id"])).encode("utf-8"))
-            d["id"] = md5.hexdigest()
+            d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
             d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
             d["create_timestamp_flt"] = datetime.now().timestamp()
             if not d.get("image"):
diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 5c53fe241b..f73afa0bbd 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -35,17 +35,13 @@ from rag.nlp import search
 
 def trim_header_by_lines(text: str, max_length) -> str:
-    if len(text) <= max_length:
+    len_text = len(text)
+    if len_text <= max_length:
         return text
-    lines = text.split("\n")
-    total = 0
-    idx = len(lines) - 1
-    for i in range(len(lines)-1, -1, -1):
-        if total + len(lines[i]) > max_length:
-            break
-        idx = i
-    text2 = "\n".join(lines[idx:])
-    return text2
+    for i in range(len_text):
+        if text[i] == '\n' and len_text - i <= max_length:
+            return text[i+1:]
+    return text
 
 
 class TaskService(CommonService):
     model = Task
@@ -183,7 +179,7 @@ def update_progress(cls, id, info):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
@@ -194,7 +190,7 @@ def update_progress(cls, id, info):
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 10a2082c6a..da62b7245c 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -27,7 +27,7 @@
 import os
 from datetime import datetime
 import json
-import hashlib
+import xxhash
 import copy
 import re
 import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     tk_count = 0
     for content, vctr in chunks[original_length:]:
         d = copy.deepcopy(doc)
-        md5 = hashlib.md5()
-        md5.update((content + str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         d[vctr_nm] = vctr.tolist()
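
For reviewers who want to see the id change in isolation, here is a minimal standalone sketch of the before/after hashing. The `content` and `doc_id` values are made up for illustration; only the `hashlib.md5` / `xxhash.xxh64` calls mirror the patch (the `xxhash` package is a third-party dependency, installable via `pip install xxhash`):

```python
import hashlib
import xxhash  # pip install xxhash

# Hypothetical stand-ins for a chunk's text and its parent document id.
content, doc_id = "some chunk text", "doc42"

# Before: md5 yields a 32-hex-char digest (128 bits).
old_id = hashlib.md5((content + doc_id).encode("utf-8")).hexdigest()

# After: xxhash64 yields a 16-hex-char digest (64 bits).
new_id = xxhash.xxh64((content + doc_id).encode("utf-8")).hexdigest()

print(len(old_id), len(new_id))  # 32 16
```

xxh64 is a fast non-cryptographic hash, so presumably the point of the swap: chunk ids only need to be stable and unique for a given (content, doc_id) pair, not tamper-resistant, and the 64-bit digest also halves the id length stored in the index.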
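
The patch also rewrites `trim_header_by_lines` from a backward line-by-line scan into a single forward character scan. Below is a standalone copy of the new function with a few illustrative checks; the sample strings are invented, only the function body comes from the patch:

```python
def trim_header_by_lines(text: str, max_length) -> str:
    # Same body as the patched api/db/services/task_service.py.
    len_text = len(text)
    if len_text <= max_length:
        return text
    # Find the first newline whose tail fits under the cap, then drop
    # everything up to and including it: oldest lines go, newest stay.
    for i in range(len_text):
        if text[i] == '\n' and len_text - i <= max_length:
            return text[i + 1:]
    # No line boundary fits under the cap: return the text untrimmed.
    return text

# len("a\nbb\nccc") == 8; with a cap of 7 the leading "a\n" is dropped.
assert trim_header_by_lines("a\nbb\nccc", 7) == "bb\nccc"
# Under the cap, the text passes through unchanged.
assert trim_header_by_lines("a\nbb\nccc", 100) == "a\nbb\nccc"
# A single overlong line has no newline to cut at, so it is kept whole.
assert trim_header_by_lines("x" * 20, 5) == "x" * 20
```

Note that the comparison `len_text - i <= max_length` counts the newline at position `i` itself, so the kept tail is at most `max_length - 1` characters; together with the cap change from 10000 to 1000, this makes the stored `progress_msg` strictly shorter than before.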