From d7f6fcd1a4ca80410c8839f43d9dc530c80ee8ff Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Tue, 2 Jul 2024 17:47:00 +0800
Subject: [PATCH 1/6] Add HTTP API endpoint and SDK method for downloading a document

---
 api/apps/dataset_api.py          | 35 ++++++++++-
 sdk/python/ragflow/ragflow.py    |  4 ++
 sdk/python/test/test_document.py | 99 ++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 9772a2ed916..788706b7223 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -17,7 +17,7 @@
 import re
 import warnings
 
-from flask import request
+from flask import request, make_response, send_from_directory
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
@@ -462,7 +462,7 @@ def list_documents(dataset_id):
 
 # ----------------------------list the chunks of the file-----------------------------------------------------
 
-# ----------------------------delete the chunk-----------------------------------------------------
+# -- --------------------------delete the chunk-----------------------------------------------------
 
 # ----------------------------edit the status of the chunk-----------------------------------------------------
 
@@ -474,3 +474,34 @@ def list_documents(dataset_id):
 
 # ----------------------------retrieval test-----------------------------------------------------
 
+@manager.route('/<dataset_id>/documents/<document_id>', methods=['GET'])
+@login_required
+@validate_request("target_path")
+def download_documents(dataset_id, document_id):
+    # Make sure there is target_path in the request
+    req = request.json
+    target_path = req['target_path']
+
+    try:
+        # Check whether there is this dataset
+        exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!")
+
+        # Check whether there is this document
+        exist, document = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(message=f"This document {document_id} cannot be found!",
+                                         code=RetCode.ARGUMENT_ERROR)
+
+        # The process of downloading
+        filename = document['name']
+        response = make_response(send_from_directory
+                                 (target_path, filename.encode('utf-8').decode('utf-8'), as_attachment=True))
+        response.headers["Content-Disposition"] = "attachment; filename={}".format(filename.encode().decode('latin-1'))
+
+        # Download successfully
+        return construct_json_result(data=True, code=RetCode.SUCCESS)
+    # Error
+    except Exception as e:
+        return construct_error_response(e)
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index c6c54668d7e..0ae93c197a8 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -142,3 +142,7 @@ def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", des
     # ----------------------------get a specific chunk-----------------------------------------------------
 
     # ----------------------------retrieval test-----------------------------------------------------
+    def download_file(self, dataset_id, document_id, target_path):
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
+        res = requests.get(endpoint, json={'target_path': target_path}, headers=self.authorization_header)
+        return res.json()
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index 34276353789..814a820e630 100644
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -391,3 +391,102 @@ def test_list_document_with_verifying_order_by_and_ascend(self):
 # ----------------------------get a specific chunk-----------------------------------------------------
 
 # ----------------------------retrieval test-----------------------------------------------------
+    def test_download_nonexistent_document(self):
+        """
+        Test updating a document which does not exist.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res['data']['dataset_id']
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res['data'][0]
+        doc_id = data['id']
+        # update file
+        params = {
+            "name": "new_name"
+        }
+        update_res = ragflow.update_file("fake_dataset_id", doc_id, **params)
+        assert (update_res['code'] == RetCode.DATA_ERROR and
+                update_res['message'] == f"This dataset fake_dataset_id cannot be found!")
+
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_update_nonexistent_document")
+        created_res_id = created_res['data']['dataset_id']
+        params = {
+            "name": "new_name"
+        }
+        res = ragflow.update_file(created_res_id, "weird_doc_id", **params)
+        assert res['code'] == RetCode.ARGUMENT_ERROR and res[
+            'message'] == f"This document weird_doc_id cannot be found!"
+
+    def test_download_document_without_local_path(self):
+        """
+        Test updating a document without giving parameters.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_update_document_without_parameters")
+        created_res_id = created_res['data']['dataset_id']
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res['data'][0]
+        doc_id = data['id']
+        # update file
+        params = {
+        }
+        update_res = ragflow.update_file(created_res_id, doc_id, **params)
+        assert (update_res['code'] == RetCode.DATA_ERROR and
+                update_res['message'] == 'Please input at least one parameter that you want to update!')
+
+    def test_download_document_in_nonexistent_dataset(self):
+        """
+        Test updating a document in the nonexistent dataset.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_update_document_in_nonexistent_dataset")
+        created_res_id = created_res['data']['dataset_id']
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res['data'][0]
+        doc_id = data['id']
+        # update file
+        params = {
+            "name": "new_name"
+        }
+        update_res = ragflow.update_file("fake_dataset_id", doc_id, **params)
+        assert (update_res['code'] == RetCode.DATA_ERROR and
+                update_res['message'] == f"This dataset fake_dataset_id cannot be found!")
+
+    def test_download_document_with_success(self):
+        """
+        Test the updating of a document's name with success.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success")
+        created_res_id = created_res['data']['dataset_id']
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res['data'][0]
+        doc_id = data['id']
+        # update file
+        params = {
+            "name": "new_name.txt"
+        }
+        update_res = ragflow.update_file(created_res_id, doc_id, **params)
+        assert (update_res['code'] == RetCode.SUCCESS and
+                update_res['message'] == 'Success' and update_res['data']['name'] == "new_name.txt")
+
+    # TODO: the local repo may be false
+    # TODO: there's the same name in the local repo

From 18b4b2cc19d887b2c39931280d1305f23b954de0 Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Wed, 3 Jul 2024 15:30:44 +0800
Subject: [PATCH 2/6] Return document content as base64-encoded data from the download endpoint

---
 api/apps/dataset_api.py          |  54 +++++++++-------
 sdk/python/ragflow/ragflow.py    |  12 +++-
 sdk/python/test/test_document.py | 103 +++++++++++--------------------
 3 files changed, 75 insertions(+), 94 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 788706b7223..24306587664 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -12,7 +12,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
+import base64
 import os
 import re
 import warnings
@@ -281,9 +281,12 @@ def upload_documents(dataset_id):
         return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
                                                                       f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
 
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
     for file_obj in file_objs:
-        # the content of the file
-        file_content = file_obj.read()
         file_name = file_obj.filename
         # no name
         if not file_name:
@@ -294,15 +297,6 @@ def upload_documents(dataset_id):
         if 'http' in file_name:
             return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
 
-        # the content is empty, raising a warning
-        if file_content == b'':
-            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
-
-    # no dataset
-    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
-    if not exist:
-        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
-
     # get the root_folder
     root_folder = FileService.get_root_folder(current_user.id)
     # get the id of the root_folder
@@ -340,8 +334,14 @@ def upload_documents(dataset_id):
             location = filename
             while MINIO.obj_exist(dataset_id, location):
                 location += "_"
+
             blob = file.read()
+            # the content is empty, raising a warning
+            if blob == b'':
+                warnings.warn(f"[WARNING]: The file {filename} is empty.")
+
             MINIO.put(dataset_id, location, blob)
+
             doc = {
                 "id": get_uuid(),
                 "kb_id": dataset.id,
@@ -474,14 +474,9 @@ def list_documents(dataset_id):
 
 # ----------------------------retrieval test-----------------------------------------------------
 
-@manager.route('/<dataset_id>/documents/<document_id>', methods=['GET'])
+@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
 @login_required
-@validate_request("target_path")
 def download_documents(dataset_id, document_id):
-    # Make sure there is target_path in the request
-    req = request.json
-    target_path = req['target_path']
-
     try:
         # Check whether there is this dataset
         exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
@@ -495,13 +490,24 @@ def download_documents(dataset_id, document_id):
                                          code=RetCode.ARGUMENT_ERROR)
 
         # The process of downloading
-        filename = document['name']
-        response = make_response(send_from_directory
-                                 (target_path, filename.encode('utf-8').decode('utf-8'), as_attachment=True))
-        response.headers["Content-Disposition"] = "attachment; filename={}".format(filename.encode().decode('latin-1'))
-
+        b, n = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
+        response = make_response(MINIO.get(b, n))
+        extension = re.search(r"\.([^.]+)$", document.name)
+        if extension:
+            if document.type == FileType.VISUAL.value:
+                response.headers.set('Content-Type', 'image/%s' % extension.group(1))
+            else:
+                response.headers.set(
+                    'Content-Type',
+                    'application/%s' %
+                    extension.group(1))
+        print("---response----", response)
+        print("---response json----", response.data)
+        base64_encoded = base64.b64encode(response.data).decode('utf-8')
+        print("---base64----", base64_encoded)
         # Download successfully
-        return construct_json_result(data=True, code=RetCode.SUCCESS)
+        return construct_json_result(code=RetCode.SUCCESS, data=base64_encoded)
     # Error
     except Exception as e:
         return construct_error_response(e)
+
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index 0ae93c197a8..d3c6da0612e 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -12,7 +12,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
+import base64
 import json
 import os
 
@@ -142,7 +142,13 @@ def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", des
     # ----------------------------get a specific chunk-----------------------------------------------------
 
     # ----------------------------retrieval test-----------------------------------------------------
-    def download_file(self, dataset_id, document_id, target_path):
+    def download_file(self, dataset_id, document_id):
+        # whether path is string
         endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
-        res = requests.get(endpoint, json={'target_path': target_path}, headers=self.authorization_header)
+        res = requests.get(endpoint, headers=self.authorization_header)
+        json_data = res.json()
+        if json_data["data"]:
+            base64_encoded = json_data["data"]
+            binary_data = base64.b64decode(base64_encoded)
+            return {"code": RetCode.SUCCESS, "data": binary_data}
         return res.json()
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index 814a820e630..1be99876fc1 100644
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -393,100 +393,69 @@ def test_list_document_with_verifying_order_by_and_ascend(self):
 # ----------------------------retrieval test-----------------------------------------------------
     def test_download_nonexistent_document(self):
         """
-        Test updating a document which does not exist.
+        Test downloading a document which does not exist.
         """
         # create a dataset
         ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
         created_res = ragflow.create_dataset("test_download_nonexistent_document")
-        created_res_id = created_res['data']['dataset_id']
-        # upload files
-        file_paths = ["test_data/test.txt"]
-        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
-        # get the doc_id
-        data = uploading_res['data'][0]
-        doc_id = data['id']
-        # update file
-        params = {
-            "name": "new_name"
-        }
-        update_res = ragflow.update_file("fake_dataset_id", doc_id, **params)
-        assert (update_res['code'] == RetCode.DATA_ERROR and
-                update_res['message'] == f"This dataset fake_dataset_id cannot be found!")
+        created_res_id = created_res["data"]["dataset_id"]
+        res = ragflow.download_file(created_res_id, "imagination")
+        print(res)
+        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document imagination cannot be found!"
 
-        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
-        created_res = ragflow.create_dataset("test_update_nonexistent_document")
-        created_res_id = created_res['data']['dataset_id']
-        params = {
-            "name": "new_name"
-        }
-        res = ragflow.update_file(created_res_id, "weird_doc_id", **params)
-        assert res['code'] == RetCode.ARGUMENT_ERROR and res[
-            'message'] == f"This document weird_doc_id cannot be found!"
-
-    def test_download_document_without_local_path(self):
+    def test_download_document_in_nonexistent_dataset(self):
         """
-        Test updating a document without giving parameters.
+        Test downloading a document whose dataset is nonexistent.
         """
         # create a dataset
         ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
-        created_res = ragflow.create_dataset("test_update_document_without_parameters")
-        created_res_id = created_res['data']['dataset_id']
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
         # upload files
         file_paths = ["test_data/test.txt"]
         uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
         # get the doc_id
-        data = uploading_res['data'][0]
-        doc_id = data['id']
-        # update file
-        params = {
-        }
-        update_res = ragflow.update_file(created_res_id, doc_id, **params)
-        assert (update_res['code'] == RetCode.DATA_ERROR and
-                update_res['message'] == 'Please input at least one parameter that you want to update!')
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # download file
+        res = ragflow.download_file("imagination", doc_id)
 
-    def test_download_document_in_nonexistent_dataset(self):
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset imagination cannot be found!"
+
+    def test_download_document_with_success(self):
         """
-        Test updating a document in the nonexistent dataset.
+        Test the downloading of a document with success.
         """
         # create a dataset
         ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
-        created_res = ragflow.create_dataset("test_update_document_in_nonexistent_dataset")
-        created_res_id = created_res['data']['dataset_id']
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
         # upload files
         file_paths = ["test_data/test.txt"]
         uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
         # get the doc_id
-        data = uploading_res['data'][0]
-        doc_id = data['id']
-        # update file
-        params = {
-            "name": "new_name"
-        }
-        update_res = ragflow.update_file("fake_dataset_id", doc_id, **params)
-        assert (update_res['code'] == RetCode.DATA_ERROR and
-                update_res['message'] == f"This dataset fake_dataset_id cannot be found!")
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # download file
+        with open("test_data/test.txt", "rb") as file:
+            binary_data = file.read()
+        res = ragflow.download_file(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data
 
-    def test_download_document_with_success(self):
+    def test_download_an_empty_document_with_success(self):
         """
-        Test the updating of a document's name with success.
+        Test the downloading of an empty document with success.
         """
         # create a dataset
         ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
-        created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success")
-        created_res_id = created_res['data']['dataset_id']
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
         # upload files
-        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        file_paths = ["test_data/empty.txt"]
         uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
         # get the doc_id
-        data = uploading_res['data'][0]
-        doc_id = data['id']
-        # update file
-        params = {
-            "name": "new_name.txt"
-        }
-        update_res = ragflow.update_file(created_res_id, doc_id, **params)
-        assert (update_res['code'] == RetCode.SUCCESS and
-                update_res['message'] == 'Success' and update_res['data']['name'] == "new_name.txt")
-
-    # TODO: the local repo may be false
-    # TODO: there's the same name in the local repo
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # download file
+        res = ragflow.download_file(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["data"] == ""

From 2493714055bd6bece4c167754ac5cc075f0d9970 Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Wed, 3 Jul 2024 16:08:16 +0800
Subject: [PATCH 3/6] Save the downloaded document to the current working directory in the SDK

---
 api/apps/dataset_api.py          | 51 ++++++++++++++++----------------
 sdk/python/ragflow/ragflow.py    | 24 ++++++++-------
 sdk/python/test/test_document.py | 38 +++++++++++++-----------
 3 files changed, 59 insertions(+), 54 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 01154cfcf9a..72559203c04 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -555,27 +555,6 @@ def is_illegal_value_for_enum(value, enum_class):
     return value not in enum_class.__members__.values()
 
 # ----------------------------download a file-----------------------------------------------------
-
-# ----------------------------start parsing-----------------------------------------------------
-
-# ----------------------------stop parsing-----------------------------------------------------
-
-# ----------------------------show the status of the file-----------------------------------------------------
-
-# ----------------------------list the chunks of the file-----------------------------------------------------
-
-# -- --------------------------delete the chunk-----------------------------------------------------
-
-# ----------------------------edit the status of the chunk-----------------------------------------------------
-
-# ----------------------------insert a new chunk-----------------------------------------------------
-
-# ----------------------------upload a file-----------------------------------------------------
-
-# ----------------------------get a specific chunk-----------------------------------------------------
-
-# ----------------------------retrieval test-----------------------------------------------------
-
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
 @login_required
 def download_documents(dataset_id, document_id):
@@ -594,7 +573,8 @@ def download_documents(dataset_id, document_id):
         # The process of downloading
         b, n = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
         response = make_response(MINIO.get(b, n))
-        extension = re.search(r"\.([^.]+)$", document.name)
+        filename = document.name
+        extension = re.search(r"\.([^.]+)$", filename)
         if extension:
             if document.type == FileType.VISUAL.value:
                 response.headers.set('Content-Type', 'image/%s' % extension.group(1))
@@ -603,13 +583,32 @@ def download_documents(dataset_id, document_id):
                     'Content-Type',
                     'application/%s' %
                     extension.group(1))
-        print("---response----", response)
-        print("---response json----", response.data)
+
         base64_encoded = base64.b64encode(response.data).decode('utf-8')
-        print("---base64----", base64_encoded)
         # Download successfully
-        return construct_json_result(code=RetCode.SUCCESS, data=base64_encoded)
+        return construct_json_result(code=RetCode.SUCCESS, data={"filename": filename, "encoded_data": base64_encoded})
     # Error
     except Exception as e:
         return construct_error_response(e)
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# -- --------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
+
+
 
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index 46e3aa98933..88bdc0ca542 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -126,7 +126,20 @@ def update_file(self, dataset_id, document_id, **params):
         return response.json()
 
     # ----------------------------download a file-----------------------------------------------------
+    def download_file(self, dataset_id, document_id):
+        # whether path is string
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
+        res = requests.get(endpoint, headers=self.authorization_header)
+        json_data = res.json()
 
+        if json_data["data"]:
+            base64_encoded = json_data["data"]["encoded_data"]
+            file_path = os.path.join(os.getcwd(), json_data["data"]["filename"])
+            binary_data = base64.b64decode(base64_encoded)
+            with open(file_path, "wb") as file:
+                file.write(binary_data)
+            return {"code": RetCode.SUCCESS, "data": binary_data}
+        return res.json()
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
@@ -144,13 +157,4 @@ def update_file(self, dataset_id, document_id, **params):
     # ----------------------------get a specific chunk-----------------------------------------------------
 
     # ----------------------------retrieval test-----------------------------------------------------
-    def download_file(self, dataset_id, document_id):
-        # whether path is string
-        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
-        res = requests.get(endpoint, headers=self.authorization_header)
-        json_data = res.json()
-        if json_data["data"]:
-            base64_encoded = json_data["data"]
-            binary_data = base64.b64decode(base64_encoded)
-            return {"code": RetCode.SUCCESS, "data": binary_data}
-        return res.json()
+
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index d81afd6b92e..78d3e0bd032 100644
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -625,25 +625,9 @@ def test_update_document_with_giving_illegal_value_for_type(self):
         update_res = ragflow.update_file(created_res_id, doc_id, **params)
         assert (update_res["code"] == RetCode.DATA_ERROR and
                 update_res["message"] == "Illegal value ? for 'template_type' field.")
-# ----------------------------download a file-----------------------------------------------------
-
-# ----------------------------start parsing-----------------------------------------------------
-
-# ----------------------------stop parsing-----------------------------------------------------
-
-# ----------------------------show the status of the file-----------------------------------------------------
-
-# ----------------------------list the chunks of the file-----------------------------------------------------
 
-# ----------------------------delete the chunk-----------------------------------------------------
-
-# ----------------------------edit the status of the chunk-----------------------------------------------------
-
-# ----------------------------insert a new chunk-----------------------------------------------------
-
-# ----------------------------get a specific chunk-----------------------------------------------------
+# ----------------------------download a file-----------------------------------------------------
 
-# ----------------------------retrieval test-----------------------------------------------------
     def test_download_nonexistent_document(self):
         """
         Test downloading a document which does not exist.
@@ -711,4 +695,22 @@ def test_download_an_empty_document_with_success(self):
         doc_id = data["id"]
         # download file
         res = ragflow.download_file(created_res_id, doc_id)
-        assert res["code"] == RetCode.SUCCESS and res["data"] == ""
+        assert res["code"] == RetCode.SUCCESS and res["data"] == b""
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------

From eee518f0927f5c352ed2ed18b7dea4e1dbd1b733 Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Thu, 4 Jul 2024 12:04:05 +0800
Subject: [PATCH 4/6] download from two ends successfully

---
 api/apps/dataset_api.py          | 48 ++++++++++++++++++--------------
 sdk/python/ragflow/ragflow.py    | 21 ++++++++------
 sdk/python/test/common.py        |  2 +-
 sdk/python/test/test_document.py | 12 ++++----
 4 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 72559203c04..8c235588cd7 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -17,10 +17,12 @@
 import pathlib
 import re
 import warnings
+from io import BytesIO
 
-from flask import request, make_response, send_from_directory
+from flask import request, make_response, send_file
 from flask_login import login_required, current_user
 from httpx import HTTPError
+from minio import S3Error
 
 from api.contants import NAME_LENGTH_LIMIT
 from api.db import FileType, ParserType, FileSource
@@ -557,39 +559,58 @@ def is_illegal_value_for_enum(value, enum_class):
 # ----------------------------download a file-----------------------------------------------------
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
 @login_required
-def download_documents(dataset_id, document_id):
+def download_document(dataset_id, document_id):
+    """
+    Send one document of a dataset back to the client as a file attachment.
+
+    The bytes returned by MINIO.get are wrapped in a BytesIO and handed to
+    send_file under the stored document name with a generic
+    'application/octet-stream' MIME type. Failures are reported as JSON
+    results instead: an unknown dataset (DATA_ERROR), an unknown document
+    (ARGUMENT_ERROR) or a falsy result from MINIO.get (DATA_ERROR, "This file
+    is empty."); unexpected exceptions go through construct_error_response.
+
+    NOTE(review): the dataset and the document are looked up independently;
+    nothing here checks that the document belongs to the given dataset.
+    """
     try:
         # Check whether there is this dataset
-        exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
         if not exist:
-            return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!")
+            return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!")
 
         # Check whether there is this document
         exist, document = DocumentService.get_by_id(document_id)
         if not exist:
-            return construct_json_result(message=f"This document {document_id} cannot be found!",
+            return construct_json_result(message=f"This document '{document_id}' cannot be found!",
                                          code=RetCode.ARGUMENT_ERROR)
 
         # The process of downloading
-        b, n = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
-        response = make_response(MINIO.get(b, n))
-        filename = document.name
-        extension = re.search(r"\.([^.]+)$", filename)
-        if extension:
-            if document.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s' % extension.group(1))
-            else:
-                response.headers.set(
-                    'Content-Type',
-                    'application/%s' %
-                    extension.group(1))
-
-        base64_encoded = base64.b64encode(response.data).decode('utf-8')
-        # Download successfully
-        return construct_json_result(code=RetCode.SUCCESS, data={"filename": filename, "encoded_data": base64_encoded})
+        # NOTE(review): the two values are passed straight to MINIO.get, so the first one is
+        # presumably a bucket name rather than a document id despite its name - confirm.
+        doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
+        file_stream = MINIO.get(doc_id, doc_location)
+        if not file_stream:
+            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
+
+        file = BytesIO(file_stream)
+
+        # Use send_file with a proper filename and MIME type
+        try:
+            return send_file(
+                file,
+                as_attachment=True,
+                download_name=document.name,
+                mimetype='application/octet-stream'  # Set a default MIME type
+            )
+        except S3Error as e:
+            # Handle the error from MinIO
+            return construct_json_result(code=RetCode.SERVER_ERROR, message=str(e))
+
     # Error
     except Exception as e:
         return construct_error_response(e)
+
 # ----------------------------start parsing-----------------------------------------------------
 
 # ----------------------------stop parsing-----------------------------------------------------
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index 88bdc0ca542..efa271b17e9 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -18,6 +18,7 @@
 
 import requests
 
+from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 
 
@@ -127,19 +128,36 @@ def update_file(self, dataset_id, document_id, **params):
 
     # ----------------------------download a file-----------------------------------------------------
     def download_file(self, dataset_id, document_id):
-        # whether path is string
+        """
+        Download a document and save it into the current working directory.
+
+        A response body that parses as JSON is treated as a status/error message
+        from the server and returned unchanged. Any other body is taken to be the
+        file itself: it is written to ``os.getcwd()`` under the document's name
+        and returned as ``{"code": RetCode.SUCCESS, "data": <bytes>}``.
+
+        NOTE(review): a document whose own content is valid JSON is therefore
+        returned as if it were a server message and is not saved.
+        """
         endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
         res = requests.get(endpoint, headers=self.authorization_header)
-        json_data = res.json()
 
-        if json_data["data"]:
-            base64_encoded = json_data["data"]["encoded_data"]
-            file_path = os.path.join(os.getcwd(), json_data["data"]["filename"])
-            binary_data = base64.b64decode(base64_encoded)
+        content = res.content  # binary data
+        # decode the binary data
+        try:
+            decoded_content = content.decode("utf-8")
+            json_data = json.loads(decoded_content)
+            return json_data  # message
+        except (UnicodeDecodeError, json.JSONDecodeError):  # binary data
+            # Arbitrary bytes (PDF, image, ...) may not be valid UTF-8, in which case the decode
+            # itself fails with UnicodeDecodeError before json.loads is ever reached.
+            # NOTE(review): resolving the name through DocumentService makes the SDK depend on
+            # server-side database access; consider the Content-Disposition header instead.
+            _, document = DocumentService.get_by_id(document_id)
+            file_path = os.path.join(os.getcwd(), document.name)
             with open(file_path, "wb") as file:
-                file.write(binary_data)
-            return {"code": RetCode.SUCCESS, "data": binary_data}
-        return res.json()
+                file.write(content)
+            return {"code": RetCode.SUCCESS, "data": content}
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
diff --git a/sdk/python/test/common.py b/sdk/python/test/common.py
index 5dd313f5072..94acbf48cab 100644
--- a/sdk/python/test/common.py
+++ b/sdk/python/test/common.py
@@ -1,4 +1,4 @@
 
 
-API_KEY = 'ImFhMmJhZmUwMmQxNzExZWZhZDdmMzA0M2Q3ZWU1MzdlIg.ZnDsIQ.u-0-_qCRU6a4WICxyAPsjaafyOo'
+API_KEY = 'IjJkOGQ4ZDE2MzkyMjExZWZhYTk0MzA0M2Q3ZWU1MzdlIg.ZoUfug.RmqcYyCrlAnLtkzk6bYXiXN3eEY'
 HOST_ADDRESS = 'http://127.0.0.1:9380'
\ No newline at end of file
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index 78d3e0bd032..25f7093500c 100644
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -637,8 +637,7 @@ def test_download_nonexistent_document(self):
         created_res = ragflow.create_dataset("test_download_nonexistent_document")
         created_res_id = created_res["data"]["dataset_id"]
         res = ragflow.download_file(created_res_id, "imagination")
-        print(res)
-        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document imagination cannot be found!"
+        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document 'imagination' cannot be found!"
 
     def test_download_document_in_nonexistent_dataset(self):
         """
@@ -656,8 +655,7 @@ def test_download_document_in_nonexistent_dataset(self):
         doc_id = data["id"]
         # download file
         res = ragflow.download_file("imagination", doc_id)
-
-        assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset imagination cannot be found!"
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset 'imagination' cannot be found!"
 
     def test_download_document_with_success(self):
         """
@@ -679,9 +677,9 @@ def test_download_document_with_success(self):
         res = ragflow.download_file(created_res_id, doc_id)
         assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data
 
-    def test_download_an_empty_document_with_success(self):
+    def test_download_an_empty_document(self):
         """
-        Test the downloading of an empty document with success.
+        Test the downloading of an empty document.
         """
         # create a dataset
         ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
@@ -695,7 +693,7 @@ def test_download_an_empty_document_with_success(self):
         doc_id = data["id"]
         # download file
         res = ragflow.download_file(created_res_id, doc_id)
-        assert res["code"] == RetCode.SUCCESS and res["data"] == b""
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty."
 
 # ----------------------------start parsing-----------------------------------------------------
 

From 8a97333d59a5c54bb35612a8bf73f530971ba9e8 Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Thu, 4 Jul 2024 13:42:40 +0800
Subject: [PATCH 5/6] optimize imports

---
 api/apps/dataset_api.py          | 3 +--
 sdk/python/ragflow/ragflow.py    | 1 -
 sdk/python/test/test_document.py | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 8c235588cd7..349721cdabe 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -12,14 +12,13 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-import base64
 import os
 import pathlib
 import re
 import warnings
 from io import BytesIO
 
-from flask import request, make_response, send_file
+from flask import request, send_file
 from flask_login import login_required, current_user
 from httpx import HTTPError
 from minio import S3Error
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index efa271b17e9..6275f921c3c 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -12,7 +12,6 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-import base64
 import json
 import os
 
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index 25f7093500c..f7f87a148b8 100644
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -3,7 +3,6 @@
 from ragflow import RAGFlow
 import pytest
 from common import API_KEY, HOST_ADDRESS
-from api.contants import NAME_LENGTH_LIMIT
 
 
 class TestFile(TestSdk):

From 02bc56e7256635f50d4279d13fed3570366718a6 Mon Sep 17 00:00:00 2001
From: cecilia-uu <huikong1996@163.com>
Date: Thu, 4 Jul 2024 15:36:51 +0800
Subject: [PATCH 6/6] delete error condition

---
 api/apps/dataset_api.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 349721cdabe..3b290630bdd 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -580,16 +580,12 @@ def download_document(dataset_id, document_id):
         file = BytesIO(file_stream)
 
         # Use send_file with a proper filename and MIME type
-        try:
-            return send_file(
-                file,
-                as_attachment=True,
-                download_name=document.name,
-                mimetype='application/octet-stream'  # Set a default MIME type
-            )
-        except S3Error as e:
-            # Handle the error from MinIO
-            return construct_json_result(code=RetCode.SERVER_ERROR, message=str(e))
+        return send_file(
+            file,
+            as_attachment=True,
+            download_name=document.name,
+            mimetype='application/octet-stream'  # Set a default MIME type
+        )
 
     # Error
     except Exception as e: