From 434abf2d4abdb553ab8f216f7c75bc88daa2fe72 Mon Sep 17 00:00:00 2001
From: GYH <43509927+guoyuhao2330@users.noreply.github.com>
Date: Fri, 17 May 2024 15:58:05 +0800
Subject: [PATCH] 0517 list chunks (#821)

### What problem does this PR solve?

#717

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/apps/api_app.py                 | 43 +++++++++++++++++++++++++++++
 api/db/services/document_service.py | 13 +++++++++
 docs/conversation_api.md            | 35 +++++++++++++++++++++++
 3 files changed, 91 insertions(+)

diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index bc4fadf5e2..0494490419 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -39,6 +39,9 @@ from api.utils.file_utils import filename_type, thumbnail
 from rag.utils.minio_conn import MINIO
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.nlp import search
+from elasticsearch_dsl import Q
 
 
 def generate_confirmation_token(tenent_id):
     serializer = URLSafeTimedSerializer(tenent_id)
@@ -347,3 +350,43 @@ def upload():
         return server_error_response(e)
 
     return get_json_result(data=doc_result.to_json())
+
+
+@manager.route('/list_chunks', methods=['POST'])
+# @login_required
+def list_chunks():
+    token = request.headers.get('Authorization').split()[1]
+    objs = APIToken.query(token=token)
+    if not objs:
+        return get_json_result(
+            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)
+
+    form_data = request.form
+
+    try:
+        if "doc_name" in form_data.keys():
+            tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
+            q = Q("match", docnm_kwd=form_data['doc_name'])
+
+        elif "doc_id" in form_data.keys():
+            tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
+            q = Q("match", doc_id=form_data['doc_id'])
+        else:
+            return get_json_result(
+                data=False, retmsg="Can't find doc_name or doc_id"
+            )
+
+        res_es_search = ELASTICSEARCH.search(q, idxnm=search.index_name(tenant_id), timeout="600s")
+
+        res = [{} for _ in range(len(res_es_search['hits']['hits']))]
+
+        for index, chunk in enumerate(res_es_search['hits']['hits']):
+            res[index]['doc_name'] = chunk['_source']['docnm_kwd']
+            res[index]['content'] = chunk['_source']['content_with_weight']
+            if 'img_id' in chunk['_source'].keys():
+                res[index]['img_id'] = chunk['_source']['img_id']
+
+    except Exception as e:
+        return server_error_response(e)
+
+    return get_json_result(data=res)
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index c85bfcd115..1bb5015928 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -166,6 +166,19 @@ def get_tenant_id(cls, doc_id):
             return
         return docs[0]["tenant_id"]
 
+    @classmethod
+    @DB.connection_context()
+    def get_tenant_id_by_name(cls, name):
+        docs = cls.model.select(
+            Knowledgebase.tenant_id).join(
+            Knowledgebase, on=(
+                Knowledgebase.id == cls.model.kb_id)).where(
+            cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
+        docs = docs.dicts()
+        if not docs:
+            return
+        return docs[0]["tenant_id"]
+
     @classmethod
     @DB.connection_context()
     def get_thumbnails(cls, docids):
diff --git a/docs/conversation_api.md b/docs/conversation_api.md
index 13acec8d9e..bc40983e37 100644
--- a/docs/conversation_api.md
+++ b/docs/conversation_api.md
@@ -364,3 +364,38 @@ This is usually used when upload a file to.
 }
 ```
 
+
+## Get document chunks
+
+Get the chunks of a document by `doc_name` or `doc_id`.
+### Path: /api/list_chunks/
+### Method: POST
+
+### Parameter:
+
+| Name | Type | Optional | Description |
+|------------|--------|----------|-----------------------------------------------------------------------------------|
+| `doc_name` | string | Yes | The name of the document in the knowledge base. Required if `doc_id` is not set. |
+| `doc_id`   | string | Yes | The ID of the document in the knowledge base. Required if `doc_name` is not set. |
+
+
+### Response
+```json
+{
+    "data": [
+        {
+            "content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
+            "doc_name": "RL-Cache.pdf",
+            "img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
+        },
+        {
+            "content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how e￿ectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
+            "doc_name": "RL-Cache.pdf",
+            "img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
+        }
+    ],
+    "retcode": 0,
+    "retmsg": "success"
+}
+
+```
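
A minimal client sketch for the new endpoint, following the path, Bearer-token header, and form parameters documented above. The base URL, token value, and document name below are illustrative placeholders, not part of the patch; either `doc_name` or `doc_id` may be supplied.

```python
import requests

# Illustrative values only: adjust the base URL and API token for your deployment.
BASE_URL = "http://127.0.0.1:9380"
API_TOKEN = "ragflow-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# The endpoint reads the token from the Authorization header and the document
# identifier (doc_name or doc_id) from the POSTed form data.
response = requests.post(
    f"{BASE_URL}/api/list_chunks",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    data={"doc_name": "RL-Cache.pdf"},
)

payload = response.json()
if payload.get("retcode") == 0:
    for chunk in payload["data"]:
        # Each chunk carries doc_name, content, and (when present) img_id.
        print(chunk["doc_name"], chunk["content"][:80])
else:
    print("Request failed:", payload.get("retmsg"))
```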