Add upload file by knowledge base name API. (#539)

### What problem does this PR solve? Add upload file by knowledge base name API. ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update --------- Co-authored-by: chrysanthemum-boy <[email protected]>
infiniflow · Apr 25, 2024 · 26003b5 · 26003b5
1 parent 4130e5c
commit 26003b5
Show file tree

Hide file tree

Showing 3 changed files with 155 additions and 4 deletions.
diff --git a/api/apps/api_app.py b/api/apps/api_app.py
@@ -13,18 +13,28 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import os
+import re
 from datetime import datetime, timedelta
 from flask import request
 from flask_login import login_required, current_user
+
+from api.db import FileType, ParserType
 from api.db.db_models import APIToken, API4Conversation
+from api.db.services import duplicate_name
 from api.db.services.api_service import APITokenService, API4ConversationService
 from api.db.services.dialog_service import DialogService, chat
+from api.db.services.document_service import DocumentService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import UserTenantService
 from api.settings import RetCode
 from api.utils import get_uuid, current_timestamp, datetime_format
 from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
 from itsdangerous import URLSafeTimedSerializer
 
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils import MINIO
+
 
 def generate_confirmation_token(tenent_id):
     serializer = URLSafeTimedSerializer(tenent_id)
@@ -191,4 +201,74 @@ def get(conversation_id):
 
         return get_json_result(data=conv.to_dict())
     except Exception as e:
-        return server_error_response(e)
+        return server_error_response(e)
+
+
+@manager.route('/document/upload', methods=['POST'])
+@validate_request("kb_name")
+def upload():
+    """Upload a single file into the knowledge base named by ``kb_name``.
+
+    The caller authenticates with an API token taken from the second
+    whitespace-separated field of the ``Authorization`` header. The
+    ``kb_name`` form field selects a knowledge base owned by the token's
+    tenant, and the ``file`` multipart part carries the upload.
+
+    Returns:
+        A JSON result wrapping the inserted document record on success.
+        An error result is returned when the token is missing or unknown,
+        the knowledge base cannot be found, no file was supplied, the
+        document count reaches ``MAX_FILE_NUM_PER_USER`` (default 8192),
+        or ``filename_type`` does not recognise the file. Unexpected
+        exceptions are converted with ``server_error_response``.
+    """
+    # A missing or malformed Authorization header is an authentication
+    # failure, not a server error, so never index into it blindly.
+    auth_parts = request.headers.get('Authorization', '').split()
+    objs = APIToken.query(token=auth_parts[1]) if len(auth_parts) > 1 else None
+    if not objs:
+        return get_json_result(
+            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)
+
+    # Tolerate an absent form field: an empty name simply matches no
+    # knowledge base and yields the regular "can't find" error below.
+    kb_name = (request.form.get("kb_name") or "").strip()
+    tenant_id = objs[0].tenant_id
+
+    try:
+        found, kb = KnowledgebaseService.get_by_name(kb_name, tenant_id)
+        if not found:
+            return get_data_error_result(
+                retmsg="Can't find this knowledgebase!")
+        kb_id = kb.id
+    except Exception as e:
+        return server_error_response(e)
+
+    if 'file' not in request.files:
+        return get_json_result(
+            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
+
+    file = request.files['file']
+    if file.filename == '':
+        return get_json_result(
+            data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
+    try:
+        max_files = int(os.environ.get('MAX_FILE_NUM_PER_USER', 8192))
+        if DocumentService.get_doc_count(kb.tenant_id) >= max_files:
+            return get_data_error_result(
+                retmsg="Exceed the maximum file number of a free user!")
+
+        # Pick a document name that does not clash inside this knowledge base.
+        filename = duplicate_name(
+            DocumentService.query,
+            name=file.filename,
+            kb_id=kb_id)
+        filetype = filename_type(filename)
+        if not filetype:
+            return get_data_error_result(
+                retmsg="This type of file has not been supported yet!")
+
+        # The storage key must be unique too; pad it until it is unused.
+        location = filename
+        while MINIO.obj_exist(kb_id, location):
+            location += "_"
+        blob = file.read()
+        MINIO.put(kb_id, location, blob)
+        doc = {
+            "id": get_uuid(),
+            "kb_id": kb.id,
+            "parser_id": kb.parser_id,
+            "parser_config": kb.parser_config,
+            "created_by": kb.tenant_id,
+            "type": filetype,
+            "name": filename,
+            "location": location,
+            "size": len(blob),
+            "thumbnail": thumbnail(filename, blob)
+        }
+        # Images and slide decks override the knowledge base's default parser.
+        if doc["type"] == FileType.VISUAL:
+            doc["parser_id"] = ParserType.PICTURE.value
+        if re.search(r"\.(ppt|pptx|pages)$", filename):
+            doc["parser_id"] = ParserType.PRESENTATION.value
+        doc = DocumentService.insert(doc)
+        return get_json_result(data=doc.to_json())
+    except Exception as e:
+        return server_error_response(e)
diff --git a/api/db/services/knowledgebase_service.py b/api/db/services/knowledgebase_service.py
@@ -27,7 +27,8 @@ def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                           page_number, items_per_page, orderby, desc):
         kbs = cls.model.select().where(
             ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
-             TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
+                                                            TenantPermission.TEAM.value)) | (
+                         cls.model.tenant_id == user_id))
             & (cls.model.status == StatusEnum.VALID.value)
         )
         if desc:
@@ -56,7 +57,8 @@ def get_detail(cls, kb_id):
             cls.model.chunk_num,
             cls.model.parser_id,
             cls.model.parser_config]
-        kbs = cls.model.select(*fields).join(Tenant, on=((Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
+        kbs = cls.model.select(*fields).join(Tenant, on=(
+                    (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
             (cls.model.id == kb_id),
             (cls.model.status == StatusEnum.VALID.value)
         )
@@ -86,6 +88,7 @@ def dfs_update(old, new):
                     old[k] = list(set(old[k] + v))
                 else:
                     old[k] = v
+
         dfs_update(m.parser_config, config)
         cls.update_by_id(id, {"parser_config": m.parser_config})
 
@@ -97,3 +100,15 @@ def get_field_map(cls, ids):
             if k.parser_config and "field_map" in k.parser_config:
                 conf.update(k.parser_config["field_map"])
         return conf
+
+    @classmethod
+    @DB.connection_context()
+    def get_by_name(cls, kb_name, tenant_id):
+        """Find a valid knowledge base by name within one tenant.
+
+        Returns ``(True, kb)`` with the first matching record, or
+        ``(False, None)`` when the query yields nothing.
+        """
+        name_matches = cls.model.name == kb_name
+        owned_by_tenant = cls.model.tenant_id == tenant_id
+        is_valid = cls.model.status == StatusEnum.VALID.value
+        matches = cls.model.select().where(
+            name_matches & owned_by_tenant & is_valid)
+        if not matches:
+            return False, None
+        return True, matches[0]
diff --git a/docs/conversation_api.md b/docs/conversation_api.md
@@ -303,5 +303,61 @@ This will be called to get the answer to users' questions.
 ## Get document content or image
 
 This is usually used when display content of citation.
-### Path: /document/get/\<id\>
+### Path: /api/document/get/\<id\>
 ### Method: GET
+
+## Upload file
+
+This is used to upload a file to a knowledge base.
+### Path: /api/document/upload
+### Method: POST
+
+### Parameter:
+
+| name    | type   | optional | description                            |
+|---------|--------|----------|----------------------------------------|
+| file    | file   | No       | Upload file.                           |
+| kb_name | string | No       | Name of the target knowledge base.     |
+
+### Response 
+```json
+{
+    "data": {
+        "chunk_num": 0,
+        "create_date": "Thu, 25 Apr 2024 14:30:06 GMT",
+        "create_time": 1714026606921,
+        "created_by": "553ec818fd5711ee8ea63043d7ed348e",
+        "id": "41e9324602cd11ef9f5f3043d7ed348e",
+        "kb_id": "06802686c0a311ee85d6246e9694c130",
+        "location": "readme.txt",
+        "name": "readme.txt",
+        "parser_config": {
+            "field_map": {
+            },
+            "pages": [
+                [
+                    0,
+                    1000000
+                ]
+            ]
+        },
+        "parser_id": "general",
+        "process_begin_at": null,
+        "process_duation": 0.0,
+        "progress": 0.0,
+        "progress_msg": "",
+        "run": "0",
+        "size": 929,
+        "source_type": "local",
+        "status": "1",
+        "thumbnail": null,
+        "token_num": 0,
+        "type": "doc",
+        "update_date": "Thu, 25 Apr 2024 14:30:06 GMT",
+        "update_time": 1714026606921
+    },
+    "retcode": 0,
+    "retmsg": "success"
+}
+
+```