From d7f6fcd1a4ca80410c8839f43d9dc530c80ee8ff Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Tue, 2 Jul 2024 17:47:00 +0800 Subject: [PATCH 1/6] http api --- api/apps/dataset_api.py | 35 ++++++++++- sdk/python/ragflow/ragflow.py | 4 ++ sdk/python/test/test_document.py | 99 ++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 2 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 9772a2ed916..788706b7223 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -17,7 +17,7 @@ import re import warnings -from flask import request +from flask import request, make_response, send_from_directory from flask_login import login_required, current_user from httpx import HTTPError @@ -462,7 +462,7 @@ def list_documents(dataset_id): # ----------------------------list the chunks of the file----------------------------------------------------- -# ----------------------------delete the chunk----------------------------------------------------- +# -- --------------------------delete the chunk----------------------------------------------------- # ----------------------------edit the status of the chunk----------------------------------------------------- @@ -474,3 +474,34 @@ def list_documents(dataset_id): # ----------------------------retrieval test----------------------------------------------------- +@manager.route('//documents/', methods=['GET']) +@login_required +@validate_request("target_path") +def download_documents(dataset_id, document_id): + # Make sure there is target_path in the request + req = request.json + target_path = req['target_path'] + + try: + # Check whether there is this dataset + exist, dataset = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!") + + # Check whether there is this document + exist, document = DocumentService.get_by_id(document_id) + if not exist: + return construct_json_result(message=f"This document {document_id} cannot be found!", + code=RetCode.ARGUMENT_ERROR) + + # The process of downloading + filename = document['name'] + response = make_response(send_from_directory + (target_path, filename.encode('utf-8').decode('utf-8'), as_attachment=True)) + response.headers["Content-Disposition"] = "attachment; filename={}".format(filename.encode().decode('latin-1')) + + # Download successfully + return construct_json_result(data=True, code=RetCode.SUCCESS) + # Error + except Exception as e: + return construct_error_response(e) diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index c6c54668d7e..0ae93c197a8 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -142,3 +142,7 @@ def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", des # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- + def download_file(self, dataset_id, document_id, target_path): + endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}" + res = requests.get(endpoint, json={'target_path': target_path}, headers=self.authorization_header) + return res.json() diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 34276353789..814a820e630 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -391,3 +391,102 @@ def test_list_document_with_verifying_order_by_and_ascend(self): # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- + def test_download_nonexistent_document(self): + """ + Test updating a document which does not exist. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res['data']['dataset_id'] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res['data'][0] + doc_id = data['id'] + # update file + params = { + "name": "new_name" + } + update_res = ragflow.update_file("fake_dataset_id", doc_id, **params) + assert (update_res['code'] == RetCode.DATA_ERROR and + update_res['message'] == f"This dataset fake_dataset_id cannot be found!") + + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_update_nonexistent_document") + created_res_id = created_res['data']['dataset_id'] + params = { + "name": "new_name" + } + res = ragflow.update_file(created_res_id, "weird_doc_id", **params) + assert res['code'] == RetCode.ARGUMENT_ERROR and res[ + 'message'] == f"This document weird_doc_id cannot be found!" + + def test_download_document_without_local_path(self): + """ + Test updating a document without giving parameters. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_update_document_without_parameters") + created_res_id = created_res['data']['dataset_id'] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res['data'][0] + doc_id = data['id'] + # update file + params = { + } + update_res = ragflow.update_file(created_res_id, doc_id, **params) + assert (update_res['code'] == RetCode.DATA_ERROR and + update_res['message'] == 'Please input at least one parameter that you want to update!') + + def test_download_document_in_nonexistent_dataset(self): + """ + Test updating a document in the nonexistent dataset. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_update_document_in_nonexistent_dataset") + created_res_id = created_res['data']['dataset_id'] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res['data'][0] + doc_id = data['id'] + # update file + params = { + "name": "new_name" + } + update_res = ragflow.update_file("fake_dataset_id", doc_id, **params) + assert (update_res['code'] == RetCode.DATA_ERROR and + update_res['message'] == f"This dataset fake_dataset_id cannot be found!") + + def test_download_document_with_success(self): + """ + Test the updating of a document's name with success. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") + created_res_id = created_res['data']['dataset_id'] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res['data'][0] + doc_id = data['id'] + # update file + params = { + "name": "new_name.txt" + } + update_res = ragflow.update_file(created_res_id, doc_id, **params) + assert (update_res['code'] == RetCode.SUCCESS and + update_res['message'] == 'Success' and update_res['data']['name'] == "new_name.txt") + + # TODO: the local repo may be false + # TODO: there's the same name in the local repo From 18b4b2cc19d887b2c39931280d1305f23b954de0 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Wed, 3 Jul 2024 15:30:44 +0800 Subject: [PATCH 2/6] completed to return binary data --- api/apps/dataset_api.py | 54 +++++++++------- sdk/python/ragflow/ragflow.py | 12 +++- sdk/python/test/test_document.py | 103 +++++++++++-------------------- 3 files changed, 75 insertions(+), 94 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 788706b7223..24306587664 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import base64 import os import re import warnings @@ -281,9 +281,12 @@ def upload_documents(dataset_id): return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, " f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}") + # no dataset + exist, dataset = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR) + for file_obj in file_objs: - # the content of the file - file_content = file_obj.read() file_name = file_obj.filename # no name if not file_name: @@ -294,15 +297,6 @@ def upload_documents(dataset_id): if 'http' in file_name: return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.") - # the content is empty, raising a warning - if file_content == b'': - warnings.warn(f"[WARNING]: The file {file_name} is empty.") - - # no dataset - exist, dataset = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR) - # get the root_folder root_folder = FileService.get_root_folder(current_user.id) # get the id of the root_folder @@ -340,8 +334,14 @@ def upload_documents(dataset_id): location = filename while MINIO.obj_exist(dataset_id, location): location += "_" + blob = file.read() + # the content is empty, raising a warning + if blob == b'': + warnings.warn(f"[WARNING]: The file {filename} is empty.") + MINIO.put(dataset_id, location, blob) + doc = { "id": get_uuid(), "kb_id": dataset.id, @@ -474,14 +474,9 @@ def list_documents(dataset_id): # ----------------------------retrieval test----------------------------------------------------- -@manager.route('//documents/', methods=['GET']) +@manager.route("//documents/", methods=["GET"]) @login_required -@validate_request("target_path") def download_documents(dataset_id, document_id): - # Make sure there is target_path in the request - req = request.json - target_path = req['target_path'] - try: # Check whether there is this dataset exist, dataset = KnowledgebaseService.get_by_id(dataset_id) @@ -495,13 +490,24 @@ def download_documents(dataset_id, document_id): code=RetCode.ARGUMENT_ERROR) # The process of downloading - filename = document['name'] - response = make_response(send_from_directory - (target_path, filename.encode('utf-8').decode('utf-8'), as_attachment=True)) - response.headers["Content-Disposition"] = "attachment; filename={}".format(filename.encode().decode('latin-1')) - + b, n = File2DocumentService.get_minio_address(doc_id=document_id) # minio address + response = make_response(MINIO.get(b, n)) + extension = re.search(r"\.([^.]+)$", document.name) + if extension: + if document.type == FileType.VISUAL.value: + response.headers.set('Content-Type', 'image/%s' % extension.group(1)) + else: + response.headers.set( + 'Content-Type', + 'application/%s' % + extension.group(1)) + print("---response----", response) + print("---response json----", response.data) + base64_encoded = base64.b64encode(response.data).decode('utf-8') + print("---base64----", base64_encoded) # Download successfully - return construct_json_result(data=True, code=RetCode.SUCCESS) + return construct_json_result(code=RetCode.SUCCESS, data=base64_encoded) # Error except Exception as e: return construct_error_response(e) + diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 0ae93c197a8..d3c6da0612e 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import base64 import json import os @@ -142,7 +142,13 @@ def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", des # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- - def download_file(self, dataset_id, document_id, target_path): + def download_file(self, dataset_id, document_id): + # whether path is string endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}" - res = requests.get(endpoint, json={'target_path': target_path}, headers=self.authorization_header) + res = requests.get(endpoint, headers=self.authorization_header) + json_data = res.json() + if json_data["data"]: + base64_encoded = json_data["data"] + binary_data = base64.b64decode(base64_encoded) + return {"code": RetCode.SUCCESS, "data": binary_data} return res.json() diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 814a820e630..1be99876fc1 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -393,100 +393,69 @@ def test_list_document_with_verifying_order_by_and_ascend(self): # ----------------------------retrieval test----------------------------------------------------- def test_download_nonexistent_document(self): """ - Test updating a document which does not exist. + Test downloading a document which does not exist. """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res['data']['dataset_id'] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res['data'][0] - doc_id = data['id'] - # update file - params = { - "name": "new_name" - } - update_res = ragflow.update_file("fake_dataset_id", doc_id, **params) - assert (update_res['code'] == RetCode.DATA_ERROR and - update_res['message'] == f"This dataset fake_dataset_id cannot be found!") + created_res_id = created_res["data"]["dataset_id"] + res = ragflow.download_file(created_res_id, "imagination") + print(res) + assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document imagination cannot be found!" - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_nonexistent_document") - created_res_id = created_res['data']['dataset_id'] - params = { - "name": "new_name" - } - res = ragflow.update_file(created_res_id, "weird_doc_id", **params) - assert res['code'] == RetCode.ARGUMENT_ERROR and res[ - 'message'] == f"This document weird_doc_id cannot be found!" - - def test_download_document_without_local_path(self): + def test_download_document_in_nonexistent_dataset(self): """ - Test updating a document without giving parameters. + Test downloading a document whose dataset is nonexistent. """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_without_parameters") - created_res_id = created_res['data']['dataset_id'] + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] # upload files file_paths = ["test_data/test.txt"] uploading_res = ragflow.upload_local_file(created_res_id, file_paths) # get the doc_id - data = uploading_res['data'][0] - doc_id = data['id'] - # update file - params = { - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res['code'] == RetCode.DATA_ERROR and - update_res['message'] == 'Please input at least one parameter that you want to update!') + data = uploading_res["data"][0] + doc_id = data["id"] + # download file + res = ragflow.download_file("imagination", doc_id) - def test_download_document_in_nonexistent_dataset(self): + assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset imagination cannot be found!" + + def test_download_document_with_success(self): """ - Test updating a document in the nonexistent dataset. + Test the downloading of a document with success. """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_in_nonexistent_dataset") - created_res_id = created_res['data']['dataset_id'] + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] # upload files file_paths = ["test_data/test.txt"] uploading_res = ragflow.upload_local_file(created_res_id, file_paths) # get the doc_id - data = uploading_res['data'][0] - doc_id = data['id'] - # update file - params = { - "name": "new_name" - } - update_res = ragflow.update_file("fake_dataset_id", doc_id, **params) - assert (update_res['code'] == RetCode.DATA_ERROR and - update_res['message'] == f"This dataset fake_dataset_id cannot be found!") + data = uploading_res["data"][0] + doc_id = data["id"] + # download file + with open("test_data/test.txt", "rb") as file: + binary_data = file.read() + res = ragflow.download_file(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data - def test_download_document_with_success(self): + def test_download_an_empty_document_with_success(self): """ - Test the updating of a document's name with success. + Test the downloading of an empty document with success. """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") - created_res_id = created_res['data']['dataset_id'] + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] + file_paths = ["test_data/empty.txt"] uploading_res = ragflow.upload_local_file(created_res_id, file_paths) # get the doc_id - data = uploading_res['data'][0] - doc_id = data['id'] - # update file - params = { - "name": "new_name.txt" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res['code'] == RetCode.SUCCESS and - update_res['message'] == 'Success' and update_res['data']['name'] == "new_name.txt") - - # TODO: the local repo may be false - # TODO: there's the same name in the local repo + data = uploading_res["data"][0] + doc_id = data["id"] + # download file + res = ragflow.download_file(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] == "" From 2493714055bd6bece4c167754ac5cc075f0d9970 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Wed, 3 Jul 2024 16:08:16 +0800 Subject: [PATCH 3/6] download to the local repo --- api/apps/dataset_api.py | 51 ++++++++++++++++---------------- sdk/python/ragflow/ragflow.py | 24 ++++++++------- sdk/python/test/test_document.py | 38 +++++++++++++----------- 3 files changed, 59 insertions(+), 54 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 01154cfcf9a..72559203c04 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -555,27 +555,6 @@ def is_illegal_value_for_enum(value, enum_class): return value not in enum_class.__members__.values() # ----------------------------download a file----------------------------------------------------- - -# ----------------------------start parsing----------------------------------------------------- - -# ----------------------------stop parsing----------------------------------------------------- - -# ----------------------------show the status of the file----------------------------------------------------- - -# ----------------------------list the chunks of the file----------------------------------------------------- - -# -- --------------------------delete the chunk----------------------------------------------------- - -# ----------------------------edit the status of the chunk----------------------------------------------------- - -# ----------------------------insert a new chunk----------------------------------------------------- - -# ----------------------------upload a file----------------------------------------------------- - -# ----------------------------get a specific chunk----------------------------------------------------- - -# ----------------------------retrieval test----------------------------------------------------- - @manager.route("//documents/", methods=["GET"]) @login_required def download_documents(dataset_id, document_id): @@ -594,7 +573,8 @@ def download_documents(dataset_id, document_id): # The process of downloading b, n = File2DocumentService.get_minio_address(doc_id=document_id) # minio address response = make_response(MINIO.get(b, n)) - extension = re.search(r"\.([^.]+)$", document.name) + filename = document.name + extension = re.search(r"\.([^.]+)$", filename) if extension: if document.type == FileType.VISUAL.value: response.headers.set('Content-Type', 'image/%s' % extension.group(1)) @@ -603,13 +583,32 @@ def download_documents(dataset_id, document_id): 'Content-Type', 'application/%s' % extension.group(1)) - print("---response----", response) - print("---response json----", response.data) + base64_encoded = base64.b64encode(response.data).decode('utf-8') - print("---base64----", base64_encoded) # Download successfully - return construct_json_result(code=RetCode.SUCCESS, data=base64_encoded) + return construct_json_result(code=RetCode.SUCCESS, data={"filename": filename, "encoded_data": base64_encoded}) # Error except Exception as e: return construct_error_response(e) +# ----------------------------start parsing----------------------------------------------------- + +# ----------------------------stop parsing----------------------------------------------------- + +# ----------------------------show the status of the file----------------------------------------------------- + +# ----------------------------list the chunks of the file----------------------------------------------------- + +# -- --------------------------delete the chunk----------------------------------------------------- + +# ----------------------------edit the status of the chunk----------------------------------------------------- + +# ----------------------------insert a new chunk----------------------------------------------------- + +# ----------------------------upload a file----------------------------------------------------- + +# ----------------------------get a specific chunk----------------------------------------------------- + +# ----------------------------retrieval test----------------------------------------------------- + + diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 46e3aa98933..88bdc0ca542 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -126,7 +126,20 @@ def update_file(self, dataset_id, document_id, **params): return response.json() # ----------------------------download a file----------------------------------------------------- + def download_file(self, dataset_id, document_id): + # whether path is string + endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}" + res = requests.get(endpoint, headers=self.authorization_header) + json_data = res.json() + if json_data["data"]: + base64_encoded = json_data["data"]["encoded_data"] + file_path = os.path.join(os.getcwd(), json_data["data"]["filename"]) + binary_data = base64.b64decode(base64_encoded) + with open(file_path, "wb") as file: + file.write(binary_data) + return {"code": RetCode.SUCCESS, "data": binary_data} + return res.json() # ----------------------------start parsing----------------------------------------------------- # ----------------------------stop parsing----------------------------------------------------- @@ -144,13 +157,4 @@ def update_file(self, dataset_id, document_id, **params): # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- - def download_file(self, dataset_id, document_id): - # whether path is string - endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}" - res = requests.get(endpoint, headers=self.authorization_header) - json_data = res.json() - if json_data["data"]: - base64_encoded = json_data["data"] - binary_data = base64.b64decode(base64_encoded) - return {"code": RetCode.SUCCESS, "data": binary_data} - return res.json() + diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index d81afd6b92e..78d3e0bd032 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -625,25 +625,9 @@ def test_update_document_with_giving_illegal_value_for_type(self): update_res = ragflow.update_file(created_res_id, doc_id, **params) assert (update_res["code"] == RetCode.DATA_ERROR and update_res["message"] == "Illegal value ? for 'template_type' field.") -# ----------------------------download a file----------------------------------------------------- - -# ----------------------------start parsing----------------------------------------------------- - -# ----------------------------stop parsing----------------------------------------------------- - -# ----------------------------show the status of the file----------------------------------------------------- - -# ----------------------------list the chunks of the file----------------------------------------------------- -# ----------------------------delete the chunk----------------------------------------------------- - -# ----------------------------edit the status of the chunk----------------------------------------------------- - -# ----------------------------insert a new chunk----------------------------------------------------- - -# ----------------------------get a specific chunk----------------------------------------------------- +# ----------------------------download a file----------------------------------------------------- -# ----------------------------retrieval test----------------------------------------------------- def test_download_nonexistent_document(self): """ Test downloading a document which does not exist. @@ -711,4 +695,22 @@ def test_download_an_empty_document_with_success(self): doc_id = data["id"] # download file res = ragflow.download_file(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["data"] == "" + assert res["code"] == RetCode.SUCCESS and res["data"] == b"" + +# ----------------------------start parsing----------------------------------------------------- + +# ----------------------------stop parsing----------------------------------------------------- + +# ----------------------------show the status of the file----------------------------------------------------- + +# ----------------------------list the chunks of the file----------------------------------------------------- + +# ----------------------------delete the chunk----------------------------------------------------- + +# ----------------------------edit the status of the chunk----------------------------------------------------- + +# ----------------------------insert a new chunk----------------------------------------------------- + +# ----------------------------get a specific chunk----------------------------------------------------- + +# ----------------------------retrieval test----------------------------------------------------- From eee518f0927f5c352ed2ed18b7dea4e1dbd1b733 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 4 Jul 2024 12:04:05 +0800 Subject: [PATCH 4/6] download from two ends successfully --- api/apps/dataset_api.py | 48 ++++++++++++++++++-------------- sdk/python/ragflow/ragflow.py | 21 ++++++++------ sdk/python/test/common.py | 2 +- sdk/python/test/test_document.py | 12 ++++---- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 72559203c04..8c235588cd7 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -17,10 +17,12 @@ import pathlib import re import warnings +from io import BytesIO -from flask import request, make_response, send_from_directory +from flask import request, make_response, send_file from flask_login import login_required, current_user from httpx import HTTPError +from minio import S3Error from api.contants import NAME_LENGTH_LIMIT from api.db import FileType, ParserType, FileSource @@ -557,39 +559,43 @@ def is_illegal_value_for_enum(value, enum_class): # ----------------------------download a file----------------------------------------------------- @manager.route("//documents/", methods=["GET"]) @login_required -def download_documents(dataset_id, document_id): +def download_document(dataset_id, document_id): try: # Check whether there is this dataset - exist, dataset = KnowledgebaseService.get_by_id(dataset_id) + exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!") + return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") # Check whether there is this document exist, document = DocumentService.get_by_id(document_id) if not exist: - return construct_json_result(message=f"This document {document_id} cannot be found!", + return construct_json_result(message=f"This document '{document_id}' cannot be found!", code=RetCode.ARGUMENT_ERROR) # The process of downloading - b, n = File2DocumentService.get_minio_address(doc_id=document_id) # minio address - response = make_response(MINIO.get(b, n)) - filename = document.name - extension = re.search(r"\.([^.]+)$", filename) - if extension: - if document.type == FileType.VISUAL.value: - response.headers.set('Content-Type', 'image/%s' % extension.group(1)) - else: - response.headers.set( - 'Content-Type', - 'application/%s' % - extension.group(1)) - - base64_encoded = base64.b64encode(response.data).decode('utf-8') - # Download successfully - return construct_json_result(code=RetCode.SUCCESS, data={"filename": filename, "encoded_data": base64_encoded}) + doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id) # minio address + file_stream = MINIO.get(doc_id, doc_location) + if not file_stream: + return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR) + + file = BytesIO(file_stream) + + # Use send_file with a proper filename and MIME type + try: + return send_file( + file, + as_attachment=True, + download_name=document.name, + mimetype='application/octet-stream' # Set a default MIME type + ) + except S3Error as e: + # Handle the error from MinIO + return construct_json_result(code=RetCode.SERVER_ERROR, message=str(e)) + # Error except Exception as e: return construct_error_response(e) + # ----------------------------start parsing----------------------------------------------------- # ----------------------------stop parsing----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 88bdc0ca542..efa271b17e9 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -18,6 +18,7 @@ import requests +from api.db.services.document_service import DocumentService from api.settings import RetCode @@ -127,19 +128,21 @@ def update_file(self, dataset_id, document_id, **params): # ----------------------------download a file----------------------------------------------------- def download_file(self, dataset_id, document_id): - # whether path is string endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}" res = requests.get(endpoint, headers=self.authorization_header) - json_data = res.json() - if json_data["data"]: - base64_encoded = json_data["data"]["encoded_data"] - file_path = os.path.join(os.getcwd(), json_data["data"]["filename"]) - binary_data = base64.b64decode(base64_encoded) + content = res.content # binary data + # decode the binary data + try: + decoded_content = content.decode("utf-8") + json_data = json.loads(decoded_content) + return json_data # message + except json.JSONDecodeError: # binary data + _, document = DocumentService.get_by_id(document_id) + file_path = os.path.join(os.getcwd(), document.name) with open(file_path, "wb") as file: - file.write(binary_data) - return {"code": RetCode.SUCCESS, "data": binary_data} - return res.json() + file.write(content) + return {"code": RetCode.SUCCESS, "data": content} # ----------------------------start parsing----------------------------------------------------- # ----------------------------stop parsing----------------------------------------------------- diff --git a/sdk/python/test/common.py b/sdk/python/test/common.py index 5dd313f5072..94acbf48cab 100644 --- a/sdk/python/test/common.py +++ b/sdk/python/test/common.py @@ -1,4 +1,4 @@ -API_KEY = 'ImFhMmJhZmUwMmQxNzExZWZhZDdmMzA0M2Q3ZWU1MzdlIg.ZnDsIQ.u-0-_qCRU6a4WICxyAPsjaafyOo' +API_KEY = 'IjJkOGQ4ZDE2MzkyMjExZWZhYTk0MzA0M2Q3ZWU1MzdlIg.ZoUfug.RmqcYyCrlAnLtkzk6bYXiXN3eEY' HOST_ADDRESS = 'http://127.0.0.1:9380' \ No newline at end of file diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 78d3e0bd032..25f7093500c 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -637,8 +637,7 @@ def test_download_nonexistent_document(self): created_res = ragflow.create_dataset("test_download_nonexistent_document") created_res_id = created_res["data"]["dataset_id"] res = ragflow.download_file(created_res_id, "imagination") - print(res) - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document imagination cannot be found!" + assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document 'imagination' cannot be found!" def test_download_document_in_nonexistent_dataset(self): """ @@ -656,8 +655,7 @@ def test_download_document_in_nonexistent_dataset(self): doc_id = data["id"] # download file res = ragflow.download_file("imagination", doc_id) - - assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset imagination cannot be found!" + assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset 'imagination' cannot be found!" def test_download_document_with_success(self): """ @@ -679,9 +677,9 @@ def test_download_document_with_success(self): res = ragflow.download_file(created_res_id, doc_id) assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data - def test_download_an_empty_document_with_success(self): + def test_download_an_empty_document(self): """ - Test the downloading of an empty document with success. + Test the downloading of an empty document. """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) @@ -695,7 +693,7 @@ def test_download_an_empty_document_with_success(self): doc_id = data["id"] # download file res = ragflow.download_file(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["data"] == b"" + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty." # ----------------------------start parsing----------------------------------------------------- From 8a97333d59a5c54bb35612a8bf73f530971ba9e8 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 4 Jul 2024 13:42:40 +0800 Subject: [PATCH 5/6] optimize imports --- api/apps/dataset_api.py | 3 +-- sdk/python/ragflow/ragflow.py | 1 - sdk/python/test/test_document.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 8c235588cd7..349721cdabe 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -12,14 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 import os import pathlib import re import warnings from io import BytesIO -from flask import request, make_response, send_file +from flask import request, send_file from flask_login import login_required, current_user from httpx import HTTPError from minio import S3Error diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index efa271b17e9..6275f921c3c 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 import json import os diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 25f7093500c..f7f87a148b8 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -3,7 +3,6 @@ from ragflow import RAGFlow import pytest from common import API_KEY, HOST_ADDRESS -from api.contants import NAME_LENGTH_LIMIT class TestFile(TestSdk): From 02bc56e7256635f50d4279d13fed3570366718a6 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 4 Jul 2024 15:36:51 +0800 Subject: [PATCH 6/6] delete error condition --- api/apps/dataset_api.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 349721cdabe..3b290630bdd 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -580,16 +580,12 @@ def download_document(dataset_id, document_id): file = BytesIO(file_stream) # Use send_file with a proper filename and MIME type - try: - return send_file( - file, - as_attachment=True, - download_name=document.name, - mimetype='application/octet-stream' # Set a default MIME type - ) - except S3Error as e: - # Handle the error from MinIO - return construct_json_result(code=RetCode.SERVER_ERROR, message=str(e)) + return send_file( + file, + as_attachment=True, + download_name=document.name, + mimetype='application/octet-stream' # Set a default MIME type + ) # Error except Exception as e: