Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues in API #3008

Merged
merged 2 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions api/apps/sdk/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def create(tenant_id):
if not req.get("embedding_model"):
req['embedding_model'] = t.embd_id
else:
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
key_mapping = {
"chunk_num": "chunk_count",
Expand Down Expand Up @@ -133,6 +138,9 @@ def update(tenant_id,dataset_id):
return get_error_data_result(
retmsg="Can't change `tenant_id`.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if "parser_config" in req:
print(kb.parser_config,flush=True)
req["parser_config"]=kb.parser_config.update(req["parser_config"])
if "chunk_count" in req:
if req["chunk_count"] != kb.chunk_num:
return get_error_data_result(
Expand All @@ -153,10 +161,15 @@ def update(tenant_id,dataset_id):
if "embedding_model" in req:
if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
return get_error_data_result(
retmsg="If `chunk_count` is not 0, `embedding_method` is not changeable.")
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
if not req.get("embedding_model"):
return get_error_data_result("`embedding_model` can't be empty")
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
req['embd_id'] = req.pop('embedding_model')
if "name" in req:
Expand Down
48 changes: 38 additions & 10 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,6 @@ def update_doc(tenant_id, dataset_id, document_id):
doc.process_duation * -1)
if not e:
return get_error_data_result(retmsg="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))

Expand Down Expand Up @@ -245,14 +242,22 @@ def delete(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
req = request.json
if not req.get("ids"):
return get_error_data_result(retmsg="`ids` is required")
doc_ids = req["ids"]
if not req:
doc_ids=None
else:
doc_ids=req.get("ids")
if not doc_ids:
doc_list = []
docs=DocumentService.query(kb_id=dataset_id)
for doc in docs:
doc_list.append(doc.id)
else:
doc_list=doc_ids
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
for doc_id in doc_ids:
for doc_id in doc_list:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
Expand Down Expand Up @@ -290,8 +295,11 @@ def parse(tenant_id,dataset_id):
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id):
doc = DocumentService.query(id=id,kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress != 0.0:
return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
info = {"run": "1", "progress": 0}
info["progress_msg"] = ""
info["chunk_num"] = 0
Expand Down Expand Up @@ -349,7 +357,27 @@ def list_chunks(tenant_id,dataset_id,document_id):
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method"
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL"
}
doc=doc.to_dict()
renamed_doc = {}
for key, value in doc.items():
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
new_key = key_mapping.get(key, key)
renamed_doc[new_key] = value
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
origin_chunks = []
sign = 0
for id in sres.ids:
Expand Down Expand Up @@ -388,7 +416,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
"img_id": "image_id"
}
renamed_chunk = {}
for key, value in chunk.items():
Expand Down
Loading