From 9f8899b49d3acd01eaa8a7f51e25ed40f90f9f7e Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Thu, 21 Nov 2024 10:52:06 +0800 Subject: [PATCH] add test case for DocManager server and some minor changes --- examples/rag_milvus_store.py | 2 +- lazyllm/tools/rag/map_store.py | 7 -- tests/basic_tests/test_document.py | 99 ++++++++++++++++++++--------- tests/basic_tests/test_rag_utils.py | 35 ++++++++++ 4 files changed, 104 insertions(+), 39 deletions(-) diff --git a/examples/rag_milvus_store.py b/examples/rag_milvus_store.py index c2a23c9b..8bce49ea 100644 --- a/examples/rag_milvus_store.py +++ b/examples/rag_milvus_store.py @@ -8,7 +8,7 @@ class TmpDir: def __init__(self): - self.root_dir = os.path.expanduser(os.path.join(config['home'], 'rag_for_ut')) + self.root_dir = os.path.expanduser(os.path.join(config['home'], 'rag_for_example_ut')) self.rag_dir = os.path.join(self.root_dir, 'rag_master') os.makedirs(self.rag_dir, exist_ok=True) self.store_file = os.path.join(self.root_dir, "milvus.db") diff --git a/lazyllm/tools/rag/map_store.py b/lazyllm/tools/rag/map_store.py index fa062102..39068f4a 100644 --- a/lazyllm/tools/rag/map_store.py +++ b/lazyllm/tools/rag/map_store.py @@ -87,10 +87,3 @@ def get_index(self, type: Optional[str] = None) -> Optional[IndexBase]: if type is None: type = 'default' return self._name2index.get(type) - - def find_node_by_uid(self, uid: str) -> Optional[DocNode]: - for docs in self._group2docs.values(): - doc = docs.get(uid) - if doc: - return doc - return None diff --git a/tests/basic_tests/test_document.py b/tests/basic_tests/test_document.py index 72535c11..816aa403 100644 --- a/tests/basic_tests/test_document.py +++ b/tests/basic_tests/test_document.py @@ -1,14 +1,23 @@ import lazyllm from lazyllm.tools.rag.doc_impl import DocImpl -from lazyllm.tools.rag.utils import _FileNodeIndex from lazyllm.tools.rag.transform import SentenceSplitter from lazyllm.tools.rag.store_base import LAZY_ROOT_NAME from lazyllm.tools.rag.doc_node import DocNode -from lazyllm.tools.rag.global_metadata import RAG_DOC_PATH +from lazyllm.tools.rag.global_metadata import RAG_DOC_PATH, RAG_DOC_ID from lazyllm.tools.rag import Document, Retriever, TransformArgs, AdaptiveTransform +from lazyllm.tools.rag.doc_manager import DocManager +from lazyllm.tools.rag.utils import DocListManager from lazyllm.launcher import cleanup +from lazyllm import config from unittest.mock import MagicMock import unittest +import httpx +import os +import shutil +import io +import re +import json +import time class TestDocImpl(unittest.TestCase): @@ -167,36 +176,64 @@ def test_doc_web_module(self): assert response.status_code == 200 doc.stop() -class TestFileNodeIndex(unittest.TestCase): +class TmpDir: + def __init__(self): + self.root_dir = os.path.expanduser(os.path.join(config['home'], 'rag_for_document_ut')) + self.rag_dir = os.path.join(self.root_dir, 'rag_master') + os.makedirs(self.rag_dir, exist_ok=True) + + def __del__(self): + shutil.rmtree(self.root_dir) + +class TestDocumentServer(unittest.TestCase): def setUp(self): - self.index = _FileNodeIndex() - self.node1 = DocNode(uid='1', group=LAZY_ROOT_NAME, global_metadata={RAG_DOC_PATH: "d1"}) - self.node2 = DocNode(uid='2', group=LAZY_ROOT_NAME, global_metadata={RAG_DOC_PATH: "d2"}) - self.files = [self.node1.global_metadata[RAG_DOC_PATH], self.node2.global_metadata[RAG_DOC_PATH]] - - def test_update(self): - self.index.update([self.node1, self.node2]) - - nodes = self.index.query(self.files) - assert len(nodes) == len(self.files) - - ret = [node.global_metadata[RAG_DOC_PATH] for node in nodes] - assert set(ret) == set(self.files) - - def test_remove(self): - self.index.update([self.node1, self.node2]) - self.index.remove([self.node2.uid]) - ret = self.index.query([self.node2.global_metadata[RAG_DOC_PATH]]) - assert len(ret) == 0 - - def test_query(self): - self.index.update([self.node1, self.node2]) - ret = self.index.query([self.node2.global_metadata[RAG_DOC_PATH]]) - assert len(ret) == 1 - assert ret[0] is self.node2 - ret = self.index.query([self.node1.global_metadata[RAG_DOC_PATH]]) - assert len(ret) == 1 - assert ret[0] is self.node1 + self.dir = TmpDir() + self.dlm = DocListManager(path=self.dir.rag_dir, name=None) + self.dlm.init_tables() + + self.doc_impl = DocImpl(embed=MagicMock(), dlm=self.dlm) + self.doc_impl._lazy_init() + + self.server = lazyllm.ServerModule(DocManager(self.dlm)) + self.server.start() + + url_pattern = r'(http://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+)' + self.doc_server_addr = re.findall(url_pattern, self.server._url)[0] + + def test_delete_files_in_store(self): + files = [('files', ('test1.txt', io.BytesIO(b"John's house is in Beijing"), 'text/palin')), + ('files', ('test2.txt', io.BytesIO(b"John's house is in Shanghai"), 'text/plain'))] + metadatas = [{"comment": "comment1"}, {"signature": "signature2"}] + params = dict(override='true', metadatas=json.dumps(metadatas)) + + url = f'{self.doc_server_addr}/upload_files' + response = httpx.post(url, params=params, files=files, timeout=10) + assert response.status_code == 200 and response.json().get('code') == 200, response.json() + ids = response.json().get('data')[0] + lazyllm.LOG.error(f'debug!!! ids -> {ids}') + assert len(ids) == 2 + + time.sleep(20) # waiting for worker thread to update newly uploaded files + + # make sure that ids are written into the store + nodes = self.doc_impl.store.get_nodes(LAZY_ROOT_NAME) + for node in nodes: + if node.global_metadata[RAG_DOC_PATH].endswith('test1.txt'): + test1_docid = node.global_metadata[RAG_DOC_ID] + elif node.global_metadata[RAG_DOC_PATH].endswith('test2.txt'): + test2_docid = node.global_metadata[RAG_DOC_ID] + assert test1_docid and test2_docid + assert set([test1_docid, test2_docid]) == set(ids) + + url = f'{self.doc_server_addr}/delete_files' + response = httpx.post(url, json=dict(file_ids=[test1_docid])) + assert response.status_code == 200 and response.json().get('code') == 200 + + time.sleep(20) # waiting for worker thread to delete files + + nodes = self.doc_impl.store.get_nodes(LAZY_ROOT_NAME) + assert len(nodes) == 1 + assert nodes[0].global_metadata[RAG_DOC_ID] == test2_docid if __name__ == "__main__": unittest.main() diff --git a/tests/basic_tests/test_rag_utils.py b/tests/basic_tests/test_rag_utils.py index ba9adbf5..ce8c4283 100644 --- a/tests/basic_tests/test_rag_utils.py +++ b/tests/basic_tests/test_rag_utils.py @@ -1,5 +1,9 @@ from lazyllm.tools.rag.utils import generic_process_filters from lazyllm.tools.rag.doc_node import DocNode +from lazyllm.tools.rag.utils import _FileNodeIndex +from lazyllm.tools.rag.store_base import LAZY_ROOT_NAME +from lazyllm.tools.rag.global_metadata import RAG_DOC_PATH +import unittest class TestRagUtils: def test_generic_process_filters(self): @@ -19,3 +23,34 @@ def test_generic_process_filters(self): res = generic_process_filters(nodes, {'k2': 'v6'}) assert len(res) == 0 + +class TestFileNodeIndex(unittest.TestCase): + def setUp(self): + self.index = _FileNodeIndex() + self.node1 = DocNode(uid='1', group=LAZY_ROOT_NAME, global_metadata={RAG_DOC_PATH: "d1"}) + self.node2 = DocNode(uid='2', group=LAZY_ROOT_NAME, global_metadata={RAG_DOC_PATH: "d2"}) + self.files = [self.node1.global_metadata[RAG_DOC_PATH], self.node2.global_metadata[RAG_DOC_PATH]] + + def test_update(self): + self.index.update([self.node1, self.node2]) + + nodes = self.index.query(self.files) + assert len(nodes) == len(self.files) + + ret = [node.global_metadata[RAG_DOC_PATH] for node in nodes] + assert set(ret) == set(self.files) + + def test_remove(self): + self.index.update([self.node1, self.node2]) + self.index.remove([self.node2.uid]) + ret = self.index.query([self.node2.global_metadata[RAG_DOC_PATH]]) + assert len(ret) == 0 + + def test_query(self): + self.index.update([self.node1, self.node2]) + ret = self.index.query([self.node2.global_metadata[RAG_DOC_PATH]]) + assert len(ret) == 1 + assert ret[0] is self.node2 + ret = self.index.query([self.node1.global_metadata[RAG_DOC_PATH]]) + assert len(ret) == 1 + assert ret[0] is self.node1