From 434abf2d4abdb553ab8f216f7c75bc88daa2fe72 Mon Sep 17 00:00:00 2001
From: GYH <43509927+guoyuhao2330@users.noreply.github.com>
Date: Fri, 17 May 2024 15:58:05 +0800
Subject: [PATCH] 0517 list chunks (#821)

### What problem does this PR solve?

#717

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/apps/api_app.py                 | 43 +++++++++++++++++++++++++++++
 api/db/services/document_service.py | 13 +++++++++
 docs/conversation_api.md            | 35 +++++++++++++++++++++++
 3 files changed, 91 insertions(+)

diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index bc4fadf5e2..0494490419 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -39,6 +39,9 @@ from api.utils.file_utils import filename_type, thumbnail
 from rag.utils.minio_conn import MINIO
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.nlp import search
+from elasticsearch_dsl import Q
 
 
 def generate_confirmation_token(tenent_id):
     serializer = URLSafeTimedSerializer(tenent_id)
@@ -347,3 +350,43 @@ def upload():
         return server_error_response(e)
 
     return get_json_result(data=doc_result.to_json())
+
+
+@manager.route('/list_chunks', methods=['POST'])
+# @login_required
+def list_chunks():
+    token = request.headers.get('Authorization').split()[1]
+    objs = APIToken.query(token=token)
+    if not objs:
+        return get_json_result(
+            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)
+
+    form_data = request.form
+
+    try:
+        if "doc_name" in form_data.keys():
+            tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
+            q = Q("match", docnm_kwd=form_data['doc_name'])
+
+        elif "doc_id" in form_data.keys():
+            tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
+            q = Q("match", doc_id=form_data['doc_id'])
+        else:
+            return get_json_result(
+                data=False, retmsg="Can't find doc_name or doc_id"
+            )
+
+        res_es_search = ELASTICSEARCH.search(q, idxnm=search.index_name(tenant_id), timeout="600s")
+
+        res = [{} for _ in range(len(res_es_search['hits']['hits']))]
+
+        for index, chunk in enumerate(res_es_search['hits']['hits']):
+            res[index]['doc_name'] = chunk['_source']['docnm_kwd']
+            res[index]['content'] = chunk['_source']['content_with_weight']
+            if 'img_id' in chunk['_source'].keys():
+                res[index]['img_id'] = chunk['_source']['img_id']
+
+    except Exception as e:
+        return server_error_response(e)
+
+    return get_json_result(data=res)
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index c85bfcd115..1bb5015928 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -166,6 +166,19 @@ def get_tenant_id(cls, doc_id):
             return
         return docs[0]["tenant_id"]
 
+    @classmethod
+    @DB.connection_context()
+    def get_tenant_id_by_name(cls, name):
+        docs = cls.model.select(
+            Knowledgebase.tenant_id).join(
+            Knowledgebase, on=(
+                Knowledgebase.id == cls.model.kb_id)).where(
+            cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
+        docs = docs.dicts()
+        if not docs:
+            return
+        return docs[0]["tenant_id"]
+
     @classmethod
     @DB.connection_context()
     def get_thumbnails(cls, docids):
diff --git a/docs/conversation_api.md b/docs/conversation_api.md
index 13acec8d9e..bc40983e37 100644
--- a/docs/conversation_api.md
+++ b/docs/conversation_api.md
@@ -364,3 +364,38 @@ This is usually used when upload a file to.
 }
 ```
 
+
+## Get document chunks
+
+Get the chunks of a document by `doc_name` or `doc_id`.
+### Path: /api/list_chunks/
+### Method: POST
+
+### Parameter:
+
+| Name | Type | Optional | Description |
+|------------|--------|----------|-----------------------------------------------------------------------------------|
+| `doc_name` | string | Yes | The name of the document in the knowledge base. Required if `doc_id` is not set. |
+| `doc_id`   | string | Yes | The ID of the document in the knowledge base. Required if `doc_name` is not set. |
+
+
+### Response
+```json
+{
+    "data": [
+        {
+            "content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
+            "doc_name": "RL-Cache.pdf",
+            "img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
+        },
+        {
+            "content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how e￿ectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
+            "doc_name": "RL-Cache.pdf",
+            "img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
+        }
+    ],
+    "retcode": 0,
+    "retmsg": "success"
+}
+
+```
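
A minimal client sketch for the new endpoint, following the path, Bearer-token header, and form parameters documented above. The base URL, token value, and document name below are illustrative placeholders, not part of the patch; either `doc_name` or `doc_id` may be supplied.

```python
import requests

# Illustrative values only: adjust the base URL and API token for your deployment.
BASE_URL = "http://127.0.0.1:9380"
API_TOKEN = "ragflow-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# The endpoint reads the token from the Authorization header and the document
# identifier (doc_name or doc_id) from the POSTed form data.
response = requests.post(
    f"{BASE_URL}/api/list_chunks",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    data={"doc_name": "RL-Cache.pdf"},
)

payload = response.json()
if payload.get("retcode") == 0:
    for chunk in payload["data"]:
        # Each chunk carries doc_name, content, and (when present) img_id.
        print(chunk["doc_name"], chunk["content"][:80])
else:
    print("Request failed:", payload.get("retmsg"))
```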