Skip to content

Commit

Permalink
Add test for document (infiniflow#3548)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Add test for document

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: liuhua <[email protected]>
  • Loading branch information
Feiue and liuhua authored Nov 21, 2024
1 parent 0ac6dc8 commit c7c8b38
Show file tree
Hide file tree
Showing 21 changed files with 406 additions and 31 deletions.
2 changes: 2 additions & 0 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def upload(dataset_id, tenant_id):
return get_result(
message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR
)
'''
# total size
total_size = 0
for file_obj in file_objs:
Expand All @@ -127,6 +128,7 @@ def upload(dataset_id, tenant_id):
message=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
code=settings.RetCode.ARGUMENT_ERROR,
)
'''
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
Expand Down
File renamed without changes.
File renamed without changes.
110 changes: 107 additions & 3 deletions sdk/python/test/t_document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ragflow_sdk import RAGFlow, DataSet, Document, Chunk
from ragflow_sdk import RAGFlow
from common import HOST_ADDRESS

import pytest

def test_upload_document_with_success(get_api_key_fixture):
API_KEY = get_api_key_fixture
Expand Down Expand Up @@ -48,7 +48,6 @@ def test_list_documents_in_dataset_with_success(get_api_key_fixture):
ds.list_documents(keywords="test", page=1, page_size=12)



def test_delete_documents_in_dataset_with_success(get_api_key_fixture):
API_KEY = get_api_key_fixture
rag = RAGFlow(API_KEY, HOST_ADDRESS)
Expand All @@ -59,4 +58,109 @@ def test_delete_documents_in_dataset_with_success(get_api_key_fixture):
docs = ds.upload_documents(document_infos)
ds.delete_documents([docs[0].id])

# Upload documents of different file types and parse each with the general parse method.
def test_upload_and_parse_pdf_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.pdf into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_pdf_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.pdf", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.pdf", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])

def test_upload_and_parse_docx_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.docx into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_docx_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.docx", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.docx", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
def test_upload_and_parse_excel_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.xlsx into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_excel_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.xlsx", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.xlsx", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
def test_upload_and_parse_ppt_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.ppt into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_ppt_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.ppt", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.ppt", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
def test_upload_and_parse_image_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.jpg into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_image_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.jpg", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.jpg", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
def test_upload_and_parse_txt_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.txt into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_txt_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.txt", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.txt", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
def test_upload_and_parse_md_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.md into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_md_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.md", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.md", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])

def test_upload_and_parse_json_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.json into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_json_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.json", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.json", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])

# NOTE(review): the skip reason is empty; record why the .eml case is
# disabled once that is known.
@pytest.mark.skip(reason="")
def test_upload_and_parse_eml_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.eml into a freshly created dataset and request parsing.

    Currently skipped unconditionally. When enabled, the test passes as long
    as no SDK call raises; it makes no assertions on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_eml_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.eml", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.eml", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])

def test_upload_and_parse_html_documents_with_general_parse_method(get_api_key_fixture):
    """Upload test.html into a freshly created dataset and request parsing.

    The test passes as long as no SDK call raises; it makes no assertions
    on the returned objects.
    """
    client = RAGFlow(get_api_key_fixture, HOST_ADDRESS)
    dataset = client.create_dataset(name="test_html_document")
    # Read the fixture as raw bytes; the path is relative to the test's
    # working directory.
    with open("test_data/test.html", "rb") as handle:
        payload = handle.read()
    uploaded = dataset.upload_documents(
        [{"displayed_name": "test.html", "blob": payload}]
    )
    dataset.async_parse_documents([uploaded[0].id])
2 changes: 0 additions & 2 deletions sdk/python/test/test_data/.txt

This file was deleted.

Empty file.
3 changes: 0 additions & 3 deletions sdk/python/test/test_data/lol.txt

This file was deleted.

8 changes: 0 additions & 8 deletions sdk/python/test/test_data/story.txt

This file was deleted.

Binary file added sdk/python/test/test_data/test.docx
Binary file not shown.
Loading

0 comments on commit c7c8b38

Please sign in to comment.