Replaced md5 with xxhash64 for chunk id (infiniflow#4009)
### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id

### Type of change

- [x] Refactoring
yuzhichang authored Dec 12, 2024
1 parent 301f958 commit c8b1a56
Showing 5 changed files with 17 additions and 34 deletions.
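
The change is mechanical across all five files: wherever a chunk id was the MD5 hex digest of the chunk content concatenated with the document id, it is now the xxHash64 hex digest of the same bytes. A minimal sketch of the two schemes for comparison (the helper names are illustrative, not part of the commit; it assumes the `xxhash` package is installed):

```python
import hashlib
import xxhash

def chunk_id_md5(content: str, doc_id: str) -> str:
    # Old scheme: 128-bit MD5, 32 hex characters.
    return hashlib.md5((content + doc_id).encode("utf-8")).hexdigest()

def chunk_id_xxh64(content: str, doc_id: str) -> str:
    # New scheme: 64-bit xxHash64, 16 hex characters; a fast non-cryptographic hash.
    return xxhash.xxh64((content + doc_id).encode("utf-8")).hexdigest()

print(chunk_id_md5("some chunk text", "doc-123"))    # 32-char hex id (before)
print(chunk_id_xxh64("some chunk text", "doc-123"))  # 16-char hex id (after)
```

Note that ids derived from the same content change format after this commit: they shrink from 32 to 16 hex characters.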
6 changes: 2 additions & 4 deletions api/apps/chunk_app.py
@@ -31,7 +31,7 @@
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re


@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
7 changes: 2 additions & 5 deletions api/apps/sdk/doc.py
@@ -22,7 +22,7 @@
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
7 changes: 2 additions & 5 deletions api/db/services/document_service.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 import logging
-import hashlib
+import xxhash
 import json
 import random
 import re
@@ -508,10 +508,7 @@ def dummy(prog=None, msg=""):
             for ck in th.result():
                 d = deepcopy(doc)
                 d.update(ck)
-                md5 = hashlib.md5()
-                md5.update((ck["content_with_weight"] +
-                            str(d["doc_id"])).encode("utf-8"))
-                d["id"] = md5.hexdigest()
+                d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
                 d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
                 d["create_timestamp_flt"] = datetime.now().timestamp()
                 if not d.get("image"):
20 changes: 8 additions & 12 deletions api/db/services/task_service.py
@@ -35,17 +35,13 @@
 from rag.nlp import search
 
 def trim_header_by_lines(text: str, max_length) -> str:
-    if len(text) <= max_length:
+    len_text = len(text)
+    if len_text <= max_length:
         return text
-    lines = text.split("\n")
-    total = 0
-    idx = len(lines) - 1
-    for i in range(len(lines)-1, -1, -1):
-        if total + len(lines[i]) > max_length:
-            break
-        idx = i
-    text2 = "\n".join(lines[idx:])
-    return text2
+    for i in range(len_text):
+        if text[i] == '\n' and len_text - i <= max_length:
+            return text[i+1:]
+    return text
 
 class TaskService(CommonService):
     model = Task
@@ -183,7 +179,7 @@ def update_progress(cls, id, info):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
@@ -194,7 +190,7 @@ def update_progress(cls, id, info):
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
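
The `trim_header_by_lines` helper above now scans forward for the first newline after which the remaining text fits under `max_length`, instead of accumulating line lengths from the end, and the cap passed by `update_progress` drops from 10000 to 1000 characters. A small self-contained sketch of the new behavior (the sample log text below is made up for illustration):

```python
def trim_header_by_lines(text: str, max_length) -> str:
    # Copy of the new implementation, reproduced here only to demonstrate its behavior.
    len_text = len(text)
    if len_text <= max_length:
        return text
    for i in range(len_text):
        if text[i] == '\n' and len_text - i <= max_length:
            return text[i + 1:]
    return text

log = "\n".join(f"step {n}: done" for n in range(100))  # roughly 1.4 KB of progress lines
trimmed = trim_header_by_lines(log, 1000)
assert len(trimmed) < 1000                      # result fits under the cap
assert trimmed.endswith("step 99: done")        # the newest lines are kept intact
assert not trimmed.startswith("step 0")         # the oldest lines are dropped from the head
```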
11 changes: 3 additions & 8 deletions rag/svr/task_executor.py
@@ -27,7 +27,7 @@
 import os
 from datetime import datetime
 import json
-import hashlib
+import xxhash
 import copy
 import re
 import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
        d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
        d["create_timestamp_flt"] = datetime.now().timestamp()
        if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     tk_count = 0
     for content, vctr in chunks[original_length:]:
         d = copy.deepcopy(doc)
-        md5 = hashlib.md5()
-        md5.update((content + str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         d[vctr_nm] = vctr.tolist()
