diff --git a/lazyllm/common/bind.py b/lazyllm/common/bind.py index 1a150292..49780af2 100644 --- a/lazyllm/common/bind.py +++ b/lazyllm/common/bind.py @@ -78,6 +78,8 @@ def __getitem__(self, key): return self def __getattr__(self, key): + if key.startswith('__') and key.endswith('__'): + raise AttributeError(f'Args has no attribute {key}') self._attr_key = key return self diff --git a/lazyllm/docs/tools.py b/lazyllm/docs/tools.py index 499f8809..6e335870 100644 --- a/lazyllm/docs/tools.py +++ b/lazyllm/docs/tools.py @@ -156,8 +156,8 @@ >>> doc1 = Document(dataset_path="your_files_path", create_ui=False) >>> doc2 = Document(dataset_path="your_files_path", create_ui=False) >>> files = ["your_yml_files"] ->>> docs1 = doc1._impl._impl.directory_reader.load_data(input_files=files) ->>> docs2 = doc2._impl._impl.directory_reader.load_data(input_files=files) +>>> docs1 = doc1._impl._reader.load_data(input_files=files) +>>> docs2 = doc2._impl._reader.load_data(input_files=files) >>> print(docs1[0].text == docs2[0].text) # True ''') @@ -207,9 +207,9 @@ # {} >>> files = ["your_yml_files"] >>> Document.register_global_reader("**/*.yml", processYml) ->>> doc1._impl._impl.directory_reader.load_data(input_files=files) +>>> doc1._impl._reader.load_data(input_files=files) # Call the class YmlReader. ->>> doc2._impl._impl.directory_reader.load_data(input_files=files) +>>> doc2._impl._reader.load_data(input_files=files) # Call the function processYml. ''') @@ -251,7 +251,7 @@ ... >>> files = ["your_yml_files"] >>> doc = Document(dataset_path="your_files_path", create_ui=False) ->>> reader = doc._impl._impl.directory_reader.load_data(input_files=files) +>>> reader = doc._impl._reader.load_data(input_files=files) # Call the class YmlReader. ''') diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index 61d0392a..02a11c9a 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -4,7 +4,7 @@ from .dataReader import SimpleDirectoryReader class DirectoryReader: - def __init__(self, input_files: List[str], local_readers: Optional[Dict] = None, + def __init__(self, input_files: Optional[List[str]], local_readers: Optional[Dict] = None, global_readers: Optional[Dict] = None) -> None: self._input_files = input_files self._local_readers = local_readers diff --git a/lazyllm/tools/rag/doc_impl.py b/lazyllm/tools/rag/doc_impl.py index a1cf190a..10b31d45 100644 --- a/lazyllm/tools/rag/doc_impl.py +++ b/lazyllm/tools/rag/doc_impl.py @@ -8,6 +8,7 @@ from .store import MapStore, DocNode, ChromadbStore, LAZY_ROOT_NAME, BaseStore from .data_loaders import DirectoryReader from .index import DefaultIndex +import os _transmap = dict(function=FuncNodeTransform, sentencesplitter=SentenceSplitter, llm=LLMParser) @@ -25,13 +26,19 @@ def wrapper(*args, **kwargs) -> List[float]: class DocImpl: - _builtin_node_groups = {} - _global_node_groups = {} + DEDAULT_GROUP_NAME = '__default__' + _builtin_node_groups: Dict[str, Dict] = {} + _global_node_groups: Dict[str, Dict] = {} + _registered_file_reader: Dict[str, Callable] = {} - def __init__(self, embed: Dict[str, Callable], doc_files=Optional[List[str]], - local_readers: Optional[Dict] = None, global_readers: Optional[Dict] = None, **kwargs): + def __init__(self, embed: Dict[str, Callable], dataset_path: Optional[str] = None, + doc_files: Optional[str] = None, kb_group_name: str = None): super().__init__() - self.directory_reader = DirectoryReader(doc_files, local_readers=local_readers, global_readers=global_readers) + assert (dataset_path is None) ^ (doc_files is None), 'Only one of dataset_path or doc_files should be provided' + self._local_file_reader: Dict[str, Callable] = {} + self._kb_group_name = kb_group_name or DocImpl.DEDAULT_GROUP_NAME + self._dataset_path, self._doc_files = dataset_path, doc_files + self._reader = DirectoryReader(None, self._local_file_reader, DocImpl._registered_file_reader) self.node_groups: Dict[str, Dict] = {LAZY_ROOT_NAME: {}} self.embed = {k: embed_wrapper(e) for k, e in embed.items()} self.store = None @@ -46,7 +53,7 @@ def _lazy_init(self) -> None: self.store = self._get_store() self.index = DefaultIndex(self.embed, self.store) if not self.store.has_nodes(LAZY_ROOT_NAME): - root_nodes = self.directory_reader.load_data() + root_nodes = self._reader.load_data(self._list_files()) self.store.add_nodes(root_nodes) LOG.debug(f"building {LAZY_ROOT_NAME} nodes: {root_nodes}") @@ -115,11 +122,48 @@ def create_node_group(self, name, transform: Union[str, Callable] = None, parent DocImpl._create_node_group_impl(self, 'node_groups', name=name, transform=transform, parent=parent, trans_node=trans_node, num_workers=num_workers, **kwargs) - def add_files(self, input_files: List[str]) -> None: + @classmethod + def register_global_reader(cls, pattern: str, func: Optional[Callable] = None): + if func is not None: + cls._registered_file_reader[pattern] = func + return None + + def decorator(klass): + if callable(klass): cls._registered_file_reader[pattern] = klass + else: raise TypeError(f"The registered object {klass} is not a callable object.") + return klass + return decorator + + def add_reader(self, pattern: str, func: Optional[Callable] = None): + assert callable(func), 'func for reader should be callable' + self._local_file_reader[pattern] = func + + # TODO(wangzhihong): modify here to fit kb-groups + def _list_files(self) -> List[str]: + if self._doc_files: return self._doc_files + if not os.path.isabs(self._dataset_path): + raise ValueError("directory must be an absolute path") + + path = (os.path.join(self._dataset_path, self._kb_group_name) + if self._kb_group_name != DocImpl.DEDAULT_GROUP_NAME else self._dataset_path) + + try: + files_list = [] + + for root, _, files in os.walk(path): + files = [os.path.join(root, file_path) for file_path in files] + files_list.extend(files) + + return files_list + except Exception as e: + LOG.error(f"Error while listing files in {path}: {e}") + return [] + + def _add_files(self, input_files: List[str]): if len(input_files) == 0: return self._lazy_init() - root_nodes = self.directory_reader.load_data(input_files) + root_nodes = self._reader.load_data(input_files) temp_store = self._get_store() temp_store.add_nodes(root_nodes) active_groups = self.store.active_groups() @@ -130,7 +174,7 @@ def add_files(self, input_files: List[str]) -> None: self.store.add_nodes(nodes) LOG.debug(f"Merge {group} with {nodes}") - def delete_files(self, input_files: List[str]) -> None: + def _delete_files(self, input_files: List[str]) -> None: self._lazy_init() docs = self.store.get_nodes_by_files(input_files) LOG.info(f"delete_files: removing documents {input_files} and nodes {docs}") @@ -186,7 +230,8 @@ def retrieve(self, query: str, group_name: str, similarity: str, similarity_cut_ query, nodes, similarity, similarity_cut_off, topk, embed_keys, **similarity_kws ) - def find_parent(self, nodes: List[DocNode], group: str) -> List[DocNode]: + @staticmethod + def find_parent(nodes: List[DocNode], group: str) -> List[DocNode]: def recurse_parents(node: DocNode, visited: Set[DocNode]) -> None: if node.parent: if node.parent.group == group: @@ -203,13 +248,8 @@ def recurse_parents(node: DocNode, visited: Set[DocNode]) -> None: LOG.debug(f"Found parent node for {group}: {result}") return list(result) - def find_children(self, nodes: List[DocNode], group: str) -> List[DocNode]: - active_groups = self.store.active_groups() - if group not in active_groups: - raise ValueError( - f"group {group} not found in active groups {active_groups}, please retrieve the group first." - ) - + @staticmethod + def find_children(nodes: List[DocNode], group: str) -> List[DocNode]: def recurse_children(node: DocNode, visited: Set[DocNode]) -> bool: if group in node.children: visited.update(node.children[group]) diff --git a/lazyllm/tools/rag/doc_manager.py b/lazyllm/tools/rag/doc_manager.py index 1b39ae47..e6de4e03 100644 --- a/lazyllm/tools/rag/doc_manager.py +++ b/lazyllm/tools/rag/doc_manager.py @@ -6,65 +6,41 @@ import lazyllm from lazyllm import FastapiApp as app -from .group_doc import DocGroupImpl -from .utils import BaseResponse, save_files_in_threads +from .utils import BaseResponse class DocManager(lazyllm.ModuleBase): - def __init__(self, doc_impl: DocGroupImpl) -> None: + def __init__(self, root: str) -> None: super().__init__() - self._impl = doc_impl + self._root = root @app.get("/", response_model=BaseResponse, summary="docs") def document(self): return RedirectResponse(url="/docs") - @app.post("/new_group") - def new_group(self, group_name: str): - self._impl.new_group(group_name) - return BaseResponse(msg=f"create {group_name} success") - - @app.post("/delete_group") - def delete_group(self, group_name: str): - self._impl.delete_group(group_name) - return BaseResponse(msg=f"delete {group_name} success") - - @app.get("/list_groups") - def list_groups(self): - gourp_list = self._impl.list_groups() - return BaseResponse(data=gourp_list) + @app.get("/list_kb_groups") + def list_kb_groups(self): + pass @app.post("/upload_files") - def upload_files(self, files: List[UploadFile], group_name: str, override: bool): - already_exist_files, new_add_files, overwritten_files = save_files_in_threads( - files=files, - source_path=self._impl.get_group_source_path(group_name=group_name), - override=override, - ) - - self._impl.delete_files(group_name, overwritten_files, is_del_source=False) - self._impl.add_files(group_name, new_add_files + overwritten_files) - return BaseResponse( - data={ - "already_exist_files": already_exist_files, - "new_add_files": new_add_files, - "overwritten_files": overwritten_files, - } - ) + def upload_files(self, files: List[UploadFile], override: bool): + pass @app.get("/list_files") def list_files(self, group_name: str): - file_list = self._impl.list_files(group_name) - return BaseResponse(data=file_list) + pass + + @app.post("/add_files_to_group") + def add_files_to_group(self, files: List[UploadFile], group_name: str): + pass - @app.post("/delete_file") - def delete_file(self, group_name: str, file_name: str): - self._impl.delete_files(group_name, files=[file_name]) - return BaseResponse(msg=f"delete {file_name} success") + @app.post("/delete_files") + def delete_file(self, file_names: str): + pass - def forward(self, *args, **kwargs): - func_name = kwargs.pop("func_name") - return getattr(self._impl, func_name)(*args, **kwargs) + @app.post("/delete_files_from_group") + def delete_files_from_group(self, group_name: str, file_names: str): + pass def __repr__(self): return lazyllm.make_repr("Module", "DocManager") diff --git a/lazyllm/tools/rag/document.py b/lazyllm/tools/rag/document.py index dbee5924..27473d12 100644 --- a/lazyllm/tools/rag/document.py +++ b/lazyllm/tools/rag/document.py @@ -1,58 +1,53 @@ -from functools import partial import os -from typing import Callable, Optional, Dict, Union +from typing import Callable, Optional, Dict, Union, List import lazyllm from lazyllm import ModuleBase, ServerModule, DynamicDescriptor -from .web import DocWebModule from .doc_manager import DocManager -from .group_doc import DocGroupImpl, DocImpl -from .store import LAZY_ROOT_NAME, EMBED_DEFAULT_KEY +from .doc_impl import DocImpl +from .store import LAZY_ROOT_NAME, EMBED_DEFAULT_KEY, DocNode +import copy class Document(ModuleBase): - _registered_file_reader: Dict[str, Callable] = {} + class _Impl(ModuleBase): + def __init__(self, dataset_path: str, embed: Optional[Union[Callable, Dict[str, Callable]]] = None, + manager: bool = False, server: bool = False, launcher=None): + super().__init__() + if not os.path.exists(dataset_path): + defatult_path = os.path.join(lazyllm.config["data_path"], dataset_path) + if os.path.exists(defatult_path): + dataset_path = defatult_path + launcher = launcher if launcher else lazyllm.launchers.remote(sync=False) + self._dataset_path = dataset_path + self._embed = embed if isinstance(embed, dict) else {EMBED_DEFAULT_KEY: embed} + for embed in self._embed.values(): + if isinstance(embed, ModuleBase): + self._submodules.append(embed) + self._kbs = {DocImpl.DEDAULT_GROUP_NAME: DocImpl(dataset_path=dataset_path, embed=self._embed)} + if manager: self._manager = DocManager(dataset_path) + if server: self._doc = ServerModule(self._doc) + + def add_kb_group(self, name): self._kbs[name] = DocImpl(dataset_path=self.dataset_path, embed=self._embed) + def get_doc_by_kb_group(self, name): return self._kbs[name] def __init__(self, dataset_path: str, embed: Optional[Union[Callable, Dict[str, Callable]]] = None, - create_ui: bool = False, manager: bool = False, launcher=None): + create_ui: bool = False, manager: bool = False, server: bool = False, launcher=None): super().__init__() - if not os.path.exists(dataset_path): - defatult_path = os.path.join(lazyllm.config["data_path"], dataset_path) - if os.path.exists(defatult_path): - dataset_path = defatult_path if create_ui: lazyllm.LOG.warning('`create_ui` for Document is deprecated, use `manager` instead') - self._manager = create_ui or manager - launcher = launcher if launcher else lazyllm.launchers.remote(sync=False) - self._local_file_reader: Dict[str, Callable] = {} - self._embed = embed if isinstance(embed, dict) else {EMBED_DEFAULT_KEY: embed} - for embed in self._embed.values(): - if isinstance(embed, ModuleBase): - self._submodules.append(embed) - - self._impl = DocGroupImpl(dataset_path=dataset_path, embed=self._embed, local_readers=self._local_file_reader, - global_readers=self._registered_file_reader) - if self._manager: - doc_manager = DocManager(self._impl) - self.doc_server = ServerModule(doc_manager, launcher=launcher) - self.web = DocWebModule(doc_server=self.doc_server) - - def forward(self, func_name: str, *args, **kwargs): - if self._manager: - kwargs["func_name"] = func_name - return self.doc_server.forward(*args, **kwargs) - else: - return getattr(self._impl, func_name)(*args, **kwargs) - - def find_parent(self, group: str) -> Callable: - return partial(self.forward, "find_parent", group=group) + self._impls = Document._Impl(dataset_path, embed, create_ui or manager, server, launcher) + self._curr_group = DocImpl.DEDAULT_GROUP_NAME - def find_children(self, group: str) -> Callable: - return partial(self.forward, "find_children", group=group) + def create_kb_group(self, name: str) -> "Document": + self._impls.add_kb_group(name) + doc = copy.copy(self) + doc._curr_group = name + return doc - def __repr__(self): - return lazyllm.make_repr("Module", "Document", manager=self._manager) + @property + def _impl(self): return self._impls.get_doc_by_kb_group(self._curr_group) @DynamicDescriptor def create_node_group(self, name: str = None, *, transform: Callable, parent: str = LAZY_ROOT_NAME, @@ -67,19 +62,22 @@ def create_node_group(self, name: str = None, *, transform: Callable, parent: st @DynamicDescriptor def add_reader(self, pattern: str, func: Optional[Callable] = None): if isinstance(self, type): - return self.register_global_reader(pattern=pattern, func=func) + return DocImpl.register_global_reader(pattern=pattern, func=func) else: - assert callable(func), 'func for reader should be callable' - self._local_file_reader[pattern] = func + self._impl.add_reader(pattern, func) @classmethod def register_global_reader(cls, pattern: str, func: Optional[Callable] = None): - if func is not None: - cls._registered_file_reader[pattern] = func - return None + return cls.add_reader(pattern, func) + + def find_parent(self) -> Callable: + return DocImpl.find_parent + + def find_children(self) -> Callable: + return DocImpl.find_children - def decorator(klass): - if callable(klass): cls._registered_file_reader[pattern] = klass - else: raise TypeError(f"The registered object {klass} is not a callable object.") - return klass - return decorator + def forward(self, *args, **kw) -> List[DocNode]: + return self._impl.retrieve(*args, **kw) + + def __repr__(self): + return lazyllm.make_repr("Module", "Document", manager=bool(self._manager)) diff --git a/lazyllm/tools/rag/group_doc.py b/lazyllm/tools/rag/group_doc.py deleted file mode 100644 index 8cb4c7a4..00000000 --- a/lazyllm/tools/rag/group_doc.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import shutil -from typing import Callable, List, Optional, Dict -from .doc_impl import DocImpl -import lazyllm -from .store import LAZY_ROOT_NAME - -DATA_DIR = "__data" -DEFAULT_DIR = "default" - - -class DocGroupImpl(lazyllm.ModuleBase): - def __init__(self, dataset_path, embed: Dict[str, Callable], local_readers: Optional[Dict] = None, - global_readers: Optional[Dict] = None) -> None: - super().__init__() - self._dataset_path = os.path.normpath(dataset_path) - self._embed = embed - assert os.path.exists(self.dataset_path), f"{self.dataset_path} does not exist" - - if DEFAULT_DIR not in self.list_groups(): - self.new_group(DEFAULT_DIR) - self._move_file_to_default() - - file_paths = self._list_all_files(self.dataset_path, lambda x: DATA_DIR in x) - self._impl: DocImpl = DocImpl( - doc_files=file_paths, embed=self._embed, local_readers=local_readers, global_readers=global_readers, - doc_name="lazyllm_doc" - ) - - @property - def dataset_path(self): - return self._dataset_path - - def _move_file_to_default(self): - try: - items = os.listdir(self.dataset_path) - items = [os.path.join(self.dataset_path, item) for item in items] - files = [item for item in items if os.path.isfile(item)] - - group_path = self.get_group_source_path(group_name=DEFAULT_DIR) - for file in files: - shutil.move(file, os.path.join(group_path, os.path.basename(file))) - - except Exception as e: - return str(e) - - def new_group(self, group_name: str): - if os.path.exists(self.get_group_path(group_name=group_name)): - raise Exception( - f"{group_name} already exists[{self.get_group_path(group_name=group_name)}]" - ) - - for path in [ - self.get_group_path(group_name), - self.get_gropu_data_path(group_name), - self.get_group_source_path(group_name), - ]: - os.makedirs(path) - - def delete_group(self, group_name: str): - try: - shutil.rmtree(self.get_group_path(group_name)) - list_files = self.list_files(group_name) - self._impl.delete_files(list_files) - except Exception as e: - raise Exception( - f"{self.get_group_path(group_name)} delete error, exception:{e}" - ) - - return f"delete {group_name} success" - - def list_groups(self): - groups = self._list_all_subdirectories( - self.dataset_path, lambda x: DATA_DIR not in x - ) - return [dir[len(self.dataset_path) + 1:] for dir in groups] - - def add_files(self, group_name: str, files: List[str]): - source_path = self.get_group_source_path(group_name) - files = [os.path.join(source_path, file_path) for file_path in files] - self._impl.add_files(files) - - def delete_files( - self, group_name: str, files: List[str], is_del_source: bool = True - ): - self._impl.delete_files(files) - - if not is_del_source: - return - - source_path = self.get_group_source_path(group_name) - for file_path in [os.path.join(source_path, file_path) for file_path in files]: - os.remove(file_path) - - def list_files(self, group_name: str) -> List[str]: - file_paths = self._list_all_files( - self.get_group_source_path(group_name=group_name), lambda x: DATA_DIR in x - ) - return [os.path.basename(file_path) for file_path in file_paths] - - def get_group_path(self, group_name: str): - return os.path.join(self.dataset_path, group_name) - - def get_gropu_data_path(self, group_name: str): - return os.path.join(self.get_group_path(group_name=group_name), DATA_DIR) - - def get_group_source_path(self, group_name: str): - return os.path.join(self.get_gropu_data_path(group_name=group_name), "sources") - - def _list_all_subdirectories(self, directory, filter_dir=None): - if not os.path.isabs(directory): - raise ValueError("directory must be an absolute path") - try: - subdirectories = [] - - for root, dirs, files in os.walk(directory): - dirs = [os.path.join(root, dir) for dir in dirs] - filtered_dirs = list(filter(filter_dir, dirs)) if filter else dirs - subdirectories.extend(filtered_dirs) - - return subdirectories - except Exception as e: - lazyllm.LOG.error(f"Error while listing subdirectories in {directory}: {e}") - return [] - - def _list_all_files(self, directory, filter_func=None): - if not os.path.isabs(directory): - raise ValueError("directory must be an absolute path") - - try: - files_list = [] - - for root, dirs, files in os.walk(directory): - files = [os.path.join(root, file_path) for file_path in files] - filtered_files = ( - list(filter(filter_func, files)) if filter_func else files - ) - files_list.extend(filtered_files) - - return files_list - except Exception as e: - lazyllm.LOG.error(f"Error while listing files in {directory}: {e}") - return [] - - def find_parent(self, *args, **kwargs): - return self._impl.find_parent(*args, **kwargs) - - def find_children(self, *args, **kwargs): - return self._impl.find_children(*args, **kwargs) - - def retrieve(self, *args, **kwargs): - return self._impl.retrieve(*args, **kwargs) - - def __repr__(self): - return lazyllm.make_repr("Module", "DocGroupImpl") - - def create_node_group(self, name: str, transform: Callable, parent: str = LAZY_ROOT_NAME, - *, trans_node: bool = None, num_workers: int = 0, **kwargs) -> None: - self._impl.create_node_group(name, transform=transform, parent=parent, trans_node=trans_node, - num_workers=num_workers, **kwargs) diff --git a/lazyllm/tools/rag/index.py b/lazyllm/tools/rag/index.py index 18136626..d9f6ad24 100644 --- a/lazyllm/tools/rag/index.py +++ b/lazyllm/tools/rag/index.py @@ -66,8 +66,8 @@ def _parallel_do_embedding(self, nodes: List[DocNode]) -> List[DocNode]: with node._lock: if node.has_missing_embedding(k): future = executor.submit(node.do_embedding, {k: self.embed[k]}) \ - if k not in node.embedding_state else executor.submit(node.check_embedding_state, k) - node.embedding_state.add(k) + if k not in node._embedding_state else executor.submit(node.check_embedding_state, k) + node._embedding_state.add(k) futures.append(future) if len(futures) > 0: for future in concurrent.futures.as_completed(futures): diff --git a/lazyllm/tools/rag/retriever.py b/lazyllm/tools/rag/retriever.py index a2808223..5629d503 100644 --- a/lazyllm/tools/rag/retriever.py +++ b/lazyllm/tools/rag/retriever.py @@ -61,7 +61,7 @@ def __init__( @once_wrapper def _lazy_init(self): - docs = [doc for doc in self._docs if self._group_name in doc._impl._impl.node_groups or self._group_name + docs = [doc for doc in self._docs if self._group_name in doc._impl.node_groups or self._group_name in DocImpl._builtin_node_groups or self._group_name in DocImpl._global_node_groups] if not docs: raise RuntimeError(f'Group {self._group_name} not found in document {self._docs}') self._docs = docs @@ -73,8 +73,7 @@ def forward(self, query: str) -> Union[List[DocNode], str]: self._lazy_init() nodes = [] for doc in self._docs: - nodes.extend(doc.forward(func_name="retrieve", query=query, group_name=self._group_name, - similarity=self._similarity, similarity_cut_off=self._similarity_cut_off, - index=self._index, topk=self._topk, similarity_kws=self._similarity_kw, - embed_keys=self._embed_keys)) + nodes.extend(doc.forward(query=query, group_name=self._group_name, similarity=self._similarity, + similarity_cut_off=self._similarity_cut_off, index=self._index, + topk=self._topk, similarity_kws=self._similarity_kw, embed_keys=self._embed_keys)) return self._post_process(nodes) diff --git a/lazyllm/tools/rag/store.py b/lazyllm/tools/rag/store.py index fdc62c82..8e5c30c2 100644 --- a/lazyllm/tools/rag/store.py +++ b/lazyllm/tools/rag/store.py @@ -25,15 +25,9 @@ class MetadataMode(str, Enum): class DocNode: - def __init__( - self, - uid: Optional[str] = None, - text: Optional[str] = None, - group: Optional[str] = None, - embedding: Optional[Dict[str, List[float]]] = None, - parent: Optional["DocNode"] = None, - metadata: Optional[Dict[str, Any]] = None - ) -> None: + def __init__(self, uid: Optional[str] = None, text: Optional[str] = None, group: Optional[str] = None, + embedding: Optional[Dict[str, List[float]]] = None, parent: Optional["DocNode"] = None, + metadata: Optional[Dict[str, Any]] = None, classfication: Optional[str] = None): self.uid: str = uid if uid else str(uuid.uuid4()) self.text: Optional[str] = text self.group: Optional[str] = group @@ -48,7 +42,9 @@ def __init__( self.is_saved: bool = False self._docpath = None self._lock = threading.Lock() - self.embedding_state = set() + self._embedding_state = set() + # store will create index cache for classfication to speed up retrieve + self._classfication = classfication @property def root_node(self) -> Optional["DocNode"]: @@ -132,7 +128,7 @@ def check_embedding_state(self, embed_key: str) -> None: while True: with self._lock: if not self.has_missing_embedding(embed_key): - self.embedding_state.discard(embed_key) + self._embedding_state.discard(embed_key) break time.sleep(1) diff --git a/tests/basic_tests/test_document.py b/tests/basic_tests/test_document.py index 680636f9..e62dc277 100644 --- a/tests/basic_tests/test_document.py +++ b/tests/basic_tests/test_document.py @@ -18,7 +18,7 @@ def setUp(self): self.mock_directory_reader.load_data.return_value = [mock_node] self.doc_impl = DocImpl(embed=self.mock_embed, doc_files=["dummy_file.txt"]) - self.doc_impl.directory_reader = self.mock_directory_reader + self.doc_impl._reader = self.mock_directory_reader def test_create_node_group_default(self): self.doc_impl._create_builtin_node_group('MyChunk', transform=lambda x: ','.split(x)) @@ -63,11 +63,11 @@ def test_add_files(self): new_doc = DocNode(text="new dummy text", group=LAZY_ROOT_NAME) new_doc.metadata = {"file_name": "new_file.txt"} self.mock_directory_reader.load_data.return_value = [new_doc] - self.doc_impl.add_files(["new_file.txt"]) + self.doc_impl._add_files(["new_file.txt"]) assert len(self.doc_impl.store.traverse_nodes(LAZY_ROOT_NAME)) == 2 def test_delete_files(self): - self.doc_impl.delete_files(["dummy_file.txt"]) + self.doc_impl._delete_files(["dummy_file.txt"]) assert len(self.doc_impl.store.traverse_nodes(LAZY_ROOT_NAME)) == 0 @@ -90,15 +90,15 @@ def test_register_global_and_local(self): f=SentenceSplitter, kwargs=dict(chunk_size=128, chunk_overlap=10))) doc2.create_node_group('Chunk3', trans_node=True, transform=lazyllm.pipeline(SentenceSplitter(chunk_size=128, chunk_overlap=10))) - doc1._impl._impl._lazy_init() - doc2._impl._impl._lazy_init() - assert doc1._impl._impl.node_groups['Chunk1']['transform']['kwargs']['chunk_size'] == 512 - assert doc1._impl._impl.node_groups['Chunk2']['transform']['kwargs']['chunk_size'] == 256 - assert doc2._impl._impl.node_groups['Chunk1']['transform']['kwargs']['chunk_size'] == 512 - assert doc2._impl._impl.node_groups['Chunk2']['transform']['kwargs']['chunk_size'] == 128 - assert 'Chunk3' not in doc1._impl._impl.node_groups - assert isinstance(doc2._impl._impl.node_groups['Chunk3']['transform']['f'], lazyllm.pipeline) - assert doc2._impl._impl.node_groups['Chunk3']['transform']['trans_node'] is True + doc1._impl._lazy_init() + doc2._impl._lazy_init() + assert doc1._impl.node_groups['Chunk1']['transform']['kwargs']['chunk_size'] == 512 + assert doc1._impl.node_groups['Chunk2']['transform']['kwargs']['chunk_size'] == 256 + assert doc2._impl.node_groups['Chunk1']['transform']['kwargs']['chunk_size'] == 512 + assert doc2._impl.node_groups['Chunk2']['transform']['kwargs']['chunk_size'] == 128 + assert 'Chunk3' not in doc1._impl.node_groups + assert isinstance(doc2._impl.node_groups['Chunk3']['transform']['f'], lazyllm.pipeline) + assert doc2._impl.node_groups['Chunk3']['transform']['trans_node'] is True retriever = Retriever([doc1, doc2], 'Chunk2', similarity='bm25', topk=2) r = retriever('什么是道') @@ -124,7 +124,7 @@ def test_register_with_pattern(self): dict(f=SentenceSplitter, pattern='*.txt', kwargs=dict(chunk_size=512, chunk_overlap=50)), TransformArgs(f=SentenceSplitter, pattern=None, kwargs=dict(chunk_size=256, chunk_overlap=25))])) doc = Document('rag_master') - doc._impl._impl._lazy_init() + doc._impl._lazy_init() retriever = Retriever(doc, 'AdaptiveChunk1', similarity='bm25', topk=2) retriever('什么是道') retriever = Retriever(doc, 'AdaptiveChunk2', similarity='bm25', topk=2) diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index a405c1ec..cfca9354 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -25,9 +25,9 @@ def setup_method(self): self.datasets = os.path.join(lazyllm.config['data_path'], "ci_data/rag_reader/default/__data/sources") def teardown_method(self): - self.doc1._local_file_reader = {} - self.doc2._local_file_reader = {} - Document._registered_file_reader = {} + self.doc1._impl._local_file_reader = {} + self.doc2._impl._local_file_reader = {} + type(self.doc1._impl)._registered_file_reader = {} def test_reader_file(self): files = [os.path.join(self.datasets, "联网搜索.pdf"), os.path.join(self.datasets, "说明文档测试.docx")] @@ -49,22 +49,22 @@ def test_reader_dir(self): def test_register_local_reader(self): self.doc1.add_reader("**/*.yml", processYml) files = [os.path.join(self.datasets, "reader_test.yml")] - docs = self.doc1._impl._impl.directory_reader.load_data(input_files=files) + docs = self.doc1._impl._reader.load_data(input_files=files) assert docs[0].text == "Call the function processYml." def test_register_global_reader(self): Document.register_global_reader("**/*.yml", processYml) files = [os.path.join(self.datasets, "reader_test.yml")] - docs = self.doc1._impl._impl.directory_reader.load_data(input_files=files) + docs = self.doc1._impl._reader.load_data(input_files=files) assert docs[0].text == "Call the function processYml." def test_register_local_and_global_reader(self): files = [os.path.join(self.datasets, "reader_test.yml")] - docs1 = self.doc1._impl._impl.directory_reader.load_data(input_files=files) + docs1 = self.doc1._impl._reader.load_data(input_files=files) assert docs1[0].text != "Call the class YmlReader." and docs1[0].text != "Call the function processYml." Document.add_reader("**/*.yml", processYml) self.doc1.add_reader("**/*.yml", YmlReader) - docs1 = self.doc1._impl._impl.directory_reader.load_data(input_files=files) - docs2 = self.doc2._impl._impl.directory_reader.load_data(input_files=files) + docs1 = self.doc1._impl._reader.load_data(input_files=files) + docs2 = self.doc2._impl._reader.load_data(input_files=files) assert docs1[0].text == "Call the class YmlReader." and docs2[0].text == "Call the function processYml."