rename some attributes in document sdk #2481

Merged
3 commits merged on Sep 18, 2024
81 changes: 44 additions & 37 deletions api/apps/sdk/doc.py
@@ -99,6 +99,7 @@ def docinfos(tenant_id):
"chunk_num": "chunk_count",
"kb_id": "knowledgebase_id",
"token_num": "token_count",
+ "parser_id":"parser_method",
}
renamed_doc = {}
for key, value in doc.to_dict().items():
@@ -125,10 +126,14 @@ def save_doc(tenant_id):
if not e:
return get_data_error_result(retmsg="Document not found!")
#other value can't be changed
- if "chunk_num" in req:
- if req["chunk_num"] != doc.chunk_num:
+ if "chunk_count" in req:
+ if req["chunk_count"] != doc.chunk_num:
return get_data_error_result(
retmsg="Can't change chunk_count.")
+ if "token_count" in req:
+ if req["token_count"] != doc.token_num:
+ return get_data_error_result(
+ retmsg="Can't change token_count.")
if "progress" in req:
if req['progress'] != doc.progress:
return get_data_error_result(
@@ -158,9 +163,9 @@ def save_doc(tenant_id):
FileService.update_by_id(file.id, {"name": req["name"]})
except Exception as e:
return server_error_response(e)
- if "parser_id" in req:
+ if "parser_method" in req:
try:
- if doc.parser_id.lower() == req["parser_id"].lower():
+ if doc.parser_id.lower() == req["parser_method"].lower():
if "parser_config" in req:
if req["parser_config"] == doc.parser_config:
return get_json_result(data=True)
@@ -172,7 +177,7 @@ def save_doc(tenant_id):
return get_data_error_result(retmsg="Not supported yet!")

e = DocumentService.update_by_id(doc.id,
- {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+ {"parser_id": req["parser_method"], "progress": 0, "progress_msg": "",
"run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(retmsg="Document not found!")
@@ -183,7 +188,7 @@ def save_doc(tenant_id):
doc.process_duation * -1)
if not e:
return get_data_error_result(retmsg="Document not found!")
- tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+ tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
@@ -272,7 +277,7 @@ def rename():

@manager.route("/<document_id>", methods=["GET"])
@token_required
- def download_document(dataset_id, document_id,tenant_id):
+ def download_document(document_id,tenant_id):
try:
# Check whether there is this document
exist, document = DocumentService.get_by_id(document_id)
@@ -304,7 +309,7 @@ def download_document(dataset_id, document_id,tenant_id):
@manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
@token_required
def list_docs(dataset_id, tenant_id):
- kb_id = request.args.get("kb_id")
+ kb_id = request.args.get("knowledgebase_id")
if not kb_id:
return get_json_result(
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
@@ -334,6 +339,7 @@ def list_docs(dataset_id, tenant_id):
"chunk_num": "chunk_count",
"kb_id": "knowledgebase_id",
"token_num": "token_count",
+ "parser_id":"parser_method"
}
renamed_doc = {}
for key, value in doc.items():
@@ -349,10 +355,10 @@ def list_docs(dataset_id, tenant_id):
@token_required
def rm(tenant_id):
req = request.args
- if "doc_id" not in req:
+ if "document_id" not in req:
return get_data_error_result(
retmsg="doc_id is required")
- doc_ids = req["doc_id"]
+ doc_ids = req["document_id"]
if isinstance(doc_ids, str): doc_ids = [doc_ids]
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
@@ -413,7 +419,7 @@ def show_parsing_status(tenant_id, document_id):
def run(tenant_id):
req = request.json
try:
- for id in req["doc_ids"]:
+ for id in req["document_ids"]:
info = {"run": str(req["run"]), "progress": 0}
if str(req["run"]) == TaskStatus.RUNNING.value:
info["progress_msg"] = ""
@@ -442,15 +448,15 @@ def run(tenant_id):

@manager.route('/chunk/list', methods=['POST'])
@token_required
- @validate_request("doc_id")
+ @validate_request("document_id")
def list_chunk(tenant_id):
req = request.json
- doc_id = req["doc_id"]
+ doc_id = req["document_id"]
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req.get("keywords", "")
try:
- tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+ tenant_id = DocumentService.get_tenant_id(req["document_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
e, doc = DocumentService.get_by_id(doc_id)
@@ -509,38 +515,38 @@ def list_chunk(tenant_id):

@manager.route('/chunk/create', methods=['POST'])
@token_required
- @validate_request("doc_id", "content_with_weight")
+ @validate_request("document_id", "content")
def create(tenant_id):
req = request.json
md5 = hashlib.md5()
- md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
+ md5.update((req["content"] + req["document_id"]).encode("utf-8"))

chunk_id = md5.hexdigest()
- d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
- "content_with_weight": req["content_with_weight"]}
+ d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
+ "content_with_weight": req["content"]}
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req.get("important_kwd", [])
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()

try:
- e, doc = DocumentService.get_by_id(req["doc_id"])
+ e, doc = DocumentService.get_by_id(req["document_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")
d["kb_id"] = [doc.kb_id]
d["docnm_kwd"] = doc.name
d["doc_id"] = doc.id

- tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+ tenant_id = DocumentService.get_tenant_id(req["document_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")

- embd_id = DocumentService.get_embd_id(req["doc_id"])
+ embd_id = DocumentService.get_embd_id(req["document_id"])
embd_mdl = TenantLLMService.model_instance(
tenant_id, LLMType.EMBEDDING.value, embd_id)

- v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+ v, c = embd_mdl.encode([doc.name, req["content"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
@@ -568,14 +574,14 @@ def create(tenant_id):

@manager.route('/chunk/rm', methods=['POST'])
@token_required
- @validate_request("chunk_ids", "doc_id")
+ @validate_request("chunk_ids", "document_id")
def rm_chunk(tenant_id):
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
return get_data_error_result(retmsg="Index updating failure")
- e, doc = DocumentService.get_by_id(req["doc_id"])
+ e, doc = DocumentService.get_by_id(req["document_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")
deleted_chunk_ids = req["chunk_ids"]
@@ -587,46 +593,46 @@

@manager.route('/chunk/set', methods=['POST'])
@token_required
- @validate_request("doc_id", "chunk_id", "content_with_weight",
- "important_kwd")
+ @validate_request("document_id", "chunk_id", "content",
+ "important_keywords")
def set(tenant_id):
req = request.json
d = {
"id": req["chunk_id"],
- "content_with_weight": req["content_with_weight"]}
- d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
+ "content_with_weight": req["content"]}
+ d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
- d["important_kwd"] = req["important_kwd"]
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+ d["important_kwd"] = req["important_keywords"]
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
if "available_int" in req:
d["available_int"] = req["available_int"]

try:
- tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+ tenant_id = DocumentService.get_tenant_id(req["document_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")

- embd_id = DocumentService.get_embd_id(req["doc_id"])
+ embd_id = DocumentService.get_embd_id(req["document_id"])
embd_mdl = TenantLLMService.model_instance(
tenant_id, LLMType.EMBEDDING.value, embd_id)

- e, doc = DocumentService.get_by_id(req["doc_id"])
+ e, doc = DocumentService.get_by_id(req["document_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")

if doc.parser_id == ParserType.QA:
arr = [
t for t in re.split(
r"[\n\t]",
- req["content_with_weight"]) if len(t) > 1]
+ req["content"]) if len(t) > 1]
if len(arr) != 2:
return get_data_error_result(
retmsg="Q&A must be separated by TAB/ENTER key.")
q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
d = beAdoc(d, arr[0], arr[1], not any(
[rag_tokenizer.is_chinese(t) for t in q + a]))

- v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+ v, c = embd_mdl.encode([doc.name, req["content"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
@@ -636,13 +642,13 @@ def set(tenant_id):

@manager.route('/retrieval_test', methods=['POST'])
@token_required
- @validate_request("kb_id", "question")
+ @validate_request("knowledgebase_id", "question")
def retrieval_test(tenant_id):
req = request.json
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req["question"]
- kb_id = req["kb_id"]
+ kb_id = req["knowledgebase_id"]
if isinstance(kb_id, str): kb_id = [kb_id]
doc_ids = req.get("doc_ids", [])
similarity_threshold = float(req.get("similarity_threshold", 0.2))
@@ -693,6 +699,7 @@ def retrieval_test(tenant_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
+ "docnm_kwd":"document_keyword"
}
rename_chunk={}
for key, value in chunk.items():
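For reference, a minimal sketch of calling two of the renamed endpoints directly over HTTP. The base URL, the "/doc" path prefix, and the Authorization header format are assumptions for illustration; only the renamed request keys ("document_id", "content", "knowledgebase_id") are taken from the diff above.

import requests

BASE = "http://127.0.0.1:9380/doc"                   # hypothetical host and path prefix
HEADERS = {"Authorization": "Bearer YOUR_API_KEY"}   # hypothetical auth header

# Create a chunk: the payload keys "doc_id" / "content_with_weight" are now "document_id" / "content".
resp = requests.post(f"{BASE}/chunk/create", headers=HEADERS,
                     json={"document_id": "DOC_ID", "content": "Example chunk text."})
print(resp.json())

# Retrieval test: "kb_id" is now "knowledgebase_id".
resp = requests.post(f"{BASE}/retrieval_test", headers=HEADERS,
                     json={"knowledgebase_id": "KB_ID", "question": "What does this dataset cover?"})
print(resp.json())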
10 changes: 5 additions & 5 deletions sdk/python/ragflow/modules/chunk.py
@@ -22,7 +22,7 @@ def delete(self) -> bool:
Delete the chunk in the document.
"""
res = self.post('/doc/chunk/rm',
- {"doc_id": self.document_id, 'chunk_ids': [self.id]})
+ {"document_id": self.document_id, 'chunk_ids': [self.id]})
res = res.json()
if res.get("retmsg") == "success":
return True
@@ -34,13 +34,13 @@ def save(self) -> bool:
"""
res = self.post('/doc/chunk/set',
{"chunk_id": self.id,
- "kb_id": self.knowledgebase_id,
+ "knowledgebase_id": self.knowledgebase_id,
"name": self.document_name,
- "content_with_weight": self.content,
- "important_kwd": self.important_keywords,
+ "content": self.content,
+ "important_keywords": self.important_keywords,
"create_time": self.create_time,
"create_timestamp_flt": self.create_timestamp_float,
- "doc_id": self.document_id,
+ "document_id": self.document_id,
"status": self.status,
})
res = res.json()
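A short sketch of the corresponding SDK calls after the rename, assuming a Chunk object already obtained through the SDK (for example from Document.list_chunks()); only the renamed payload keys ("knowledgebase_id", "content", "important_keywords", "document_id") come from the diff above.

# `chunk` is a ragflow Chunk instance; how it was obtained is not shown here.
chunk.content = "Updated chunk text."
chunk.important_keywords = ["updated", "chunk"]
if chunk.save():      # now posts "knowledgebase_id", "content", "important_keywords", "document_id"
    print("chunk updated")
if chunk.delete():    # now posts "document_id" alongside "chunk_ids"
    print("chunk removed")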
2 changes: 1 addition & 1 deletion sdk/python/ragflow/modules/dataset.py
@@ -65,7 +65,7 @@ def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int
"""
# Construct the request payload for listing documents
payload = {
- "kb_id": self.id,
+ "knowledgebase_id": self.id,
"keywords": keywords,
"offset": offset,
"limit": limit
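Callers of list_docs are unaffected by the payload rename, since the knowledgebase id is filled in from the dataset itself. A minimal usage sketch, assuming a DataSet object already obtained from the SDK client; the attributes printed on the returned documents are assumptions.

# `dataset` is a ragflow DataSet instance; how it was obtained is not shown here.
docs = dataset.list_docs(keywords="report", offset=0, limit=12)
for doc in docs:
    print(doc.id, doc.name)   # assumed fields on the returned document objects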
18 changes: 9 additions & 9 deletions sdk/python/ragflow/modules/document.py
@@ -34,10 +34,10 @@ def save(self) -> bool:
Save the document details to the server.
"""
res = self.post('/doc/save',
- {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.knowledgebase_id,
- "parser_id": self.parser_method, "parser_config": self.parser_config.to_json(),
+ {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "knowledgebase_id": self.knowledgebase_id,
+ "parser_method": self.parser_method, "parser_config": self.parser_config.to_json(),
"source_type": self.source_type, "type": self.type, "created_by": self.created_by,
- "size": self.size, "token_num": self.token_count, "chunk_num": self.chunk_count,
+ "size": self.size, "token_count": self.token_count, "chunk_count": self.chunk_count,
"progress": self.progress, "progress_msg": self.progress_msg,
"process_begin_at": self.process_begin_at, "process_duation": self.process_duration
})
@@ -51,7 +51,7 @@ def delete(self) -> bool:
Delete the document from the server.
"""
res = self.rm('/doc/delete',
- {"doc_id": self.id})
+ {"document_id": self.id})
res = res.json()
if res.get("retmsg") == "success":
return True
@@ -83,7 +83,7 @@ def async_parse(self):
"""
try:
# Construct request data including document ID and run status (assuming 1 means to run)
- data = {"doc_ids": [self.id], "run": 1}
+ data = {"document_ids": [self.id], "run": 1}

# Send a POST request to the specified parsing status endpoint to start parsing
res = self.post(f'/doc/run', data)
@@ -112,7 +112,7 @@ def join(self, interval=5, timeout=3600):
start_time = time.time()
while time.time() - start_time < timeout:
# Check the parsing status
- res = self.get(f'/doc/{self.id}/status', {"doc_ids": [self.id]})
+ res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]})
res_data = res.json()
data = res_data.get("data", [])

@@ -133,7 +133,7 @@ def cancel(self):
"""
try:
# Construct request data, including document ID and action to cancel (assuming 2 means cancel)
- data = {"doc_ids": [self.id], "run": 2}
+ data = {"document_ids": [self.id], "run": 2}

# Send a POST request to the specified parsing status endpoint to cancel parsing
res = self.post(f'/doc/run', data)
@@ -162,7 +162,7 @@ def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available
list: A list of chunks returned from the API.
"""
data = {
- "doc_id": self.id,
+ "document_id": self.id,
"page": page,
"size": size,
"keywords": keywords,
@@ -188,7 +188,7 @@ def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available
raise Exception(f"API request failed with status code {res.status_code}")

def add_chunk(self, content: str):
- res = self.post('/doc/chunk/create', {"doc_id": self.id, "content_with_weight":content})
+ res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content})
if res.status_code == 200:
res_data = res.json().get("data")
chunk_data = res_data.get("chunk")
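Taken together, the SDK-side renames read as below. A minimal sketch, assuming a Document object already obtained from the SDK (for example via DataSet.list_docs()); the attribute and parameter names (parser_method, token_count, chunk_count, content, document_ids) come from the diff, while everything about how the object is acquired, and the "naive" parser value, are assumptions.

# `doc` is a ragflow Document instance; how it was obtained is not shown here.
doc.parser_method = "naive"                  # was parser_id; "naive" is an assumed parser value
print(doc.token_count, doc.chunk_count)      # were token_num / chunk_num
doc.save()                                   # posts the renamed fields shown above

doc.async_parse()                            # /doc/run now receives "document_ids"
doc.join(interval=5, timeout=600)            # polls /doc/<id>/status with "document_ids"

chunk = doc.add_chunk(content="Example chunk text.")    # /doc/chunk/create now takes "document_id" / "content"
chunks = doc.list_chunks(page=1, size=30, keywords="")  # /doc/chunk/list now takes "document_id"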