Add rag readers to replace llamaindex #239

Merged (55 commits, Sep 15, 2024)
Changes from 1 commit
Commits (55)
3be037b add lazyllm before group (Aug 8, 2024)
9337e4c Merge remote-tracking branch 'upstream/main' (Aug 8, 2024)
67f086c Merge remote-tracking branch 'upstream/main' (Aug 9, 2024)
dbf3280 Merge remote-tracking branch 'upstream/main' (Aug 12, 2024)
d2421d2 Merge remote-tracking branch 'upstream/main' (Aug 13, 2024)
b1169fe Merge remote-tracking branch 'upstream/main' (Aug 14, 2024)
2d73df9 Merge remote-tracking branch 'upstream/main' (Aug 14, 2024)
e3193b1 Merge remote-tracking branch 'upstream/main' (Aug 14, 2024)
80fc6c5 Merge remote-tracking branch 'upstream/main' (Aug 14, 2024)
8a1b54b Merge remote-tracking branch 'upstream/main' (Aug 15, 2024)
ef51ad1 Merge remote-tracking branch 'upstream/main' (Aug 16, 2024)
3d39b4e Merge remote-tracking branch 'upstream/main' (Aug 19, 2024)
63acf1a Merge remote-tracking branch 'upstream/main' (Aug 20, 2024)
5d8be06 Merge remote-tracking branch 'upstream/main' (Aug 20, 2024)
a0cdd74 Merge remote-tracking branch 'upstream/main' (Aug 23, 2024)
c44f5ec Merge remote-tracking branch 'upstream/main' (Aug 23, 2024)
5093af2 Merge remote-tracking branch 'upstream/main' (Aug 24, 2024)
214ec1b Merge remote-tracking branch 'upstream/main' (Aug 28, 2024)
2224adc Merge remote-tracking branch 'upstream/main' (Aug 29, 2024)
ec59048 Merge remote-tracking branch 'upstream/main' (Aug 29, 2024)
92e9c6b Merge remote-tracking branch 'upstream/main' (Aug 30, 2024)
e079af8 Merge remote-tracking branch 'upstream/main' (Aug 30, 2024)
02892f2 Merge remote-tracking branch 'upstream/main' (Aug 30, 2024)
9348f06 Merge remote-tracking branch 'upstream/main' (Aug 30, 2024)
3bd6c6a Merge remote-tracking branch 'upstream/main' (Sep 2, 2024)
e11d29d Merge remote-tracking branch 'upstream/main' (Sep 2, 2024)
1494288 Merge remote-tracking branch 'upstream/main' (Sep 3, 2024)
2a9c4fe Merge remote-tracking branch 'upstream/main' (Sep 3, 2024)
61cfb0c Merge remote-tracking branch 'upstream/main' (Sep 3, 2024)
4558336 Merge remote-tracking branch 'upstream/main' (Sep 4, 2024)
407b84e Merge remote-tracking branch 'upstream/main' (Sep 4, 2024)
ce3e287 Merge remote-tracking branch 'upstream/main' (Sep 5, 2024)
4546f52 Merge remote-tracking branch 'upstream/main' (Sep 5, 2024)
cbf4a7e add readers for rag (Sep 10, 2024)
6357946 Merge remote-tracking branch 'upstream/main' (Sep 10, 2024)
072a2ed Merge branch 'main' into wj/rag_reader (Sep 10, 2024)
aab9c47 optimizing reader register process (Sep 12, 2024)
306c115 Merge remote-tracking branch 'upstream/main' (Sep 12, 2024)
796d801 merge main branch (Sep 12, 2024)
7b6bc80 optimizing rag readers register (Sep 13, 2024)
9a92680 Merge remote-tracking branch 'upstream/main' (Sep 13, 2024)
a42a4c1 Merge branch 'main' into wj/rag_reader (Sep 13, 2024)
973c023 add two package for readers in unit tests (Sep 13, 2024)
5eca597 modify rag reader file path in unit test (Sep 13, 2024)
c03756b Optimize the usage priority of local and global registered file readers (Sep 14, 2024)
706a734 Merge remote-tracking branch 'upstream/main' (Sep 14, 2024)
89463f8 Merge branch 'main' into wj/rag_reader (Sep 14, 2024)
e5d70eb modify reader usage priority (Sep 14, 2024)
ab90f96 Merge remote-tracking branch 'upstream/main' (Sep 14, 2024)
55d5840 Merge branch 'main' into wj/rag_reader (Sep 14, 2024)
0d70ed8 modify pattern in readers of unittests (Sep 14, 2024)
8b56a64 modify reader test bug in unit tests (Sep 14, 2024)
66ef831 Merge remote-tracking branch 'upstream/main' (Sep 14, 2024)
2524a5a Merge branch 'main' into wj/rag_reader (Sep 14, 2024)
36c967a remove yaml package in rag reader of unittests (Sep 14, 2024)
optimizing rag readers register
wangjian committed Sep 13, 2024
commit 7b6bc80159a97eab2bc7da982009d54b14caaea9
3 changes: 1 addition & 2 deletions lazyllm/__init__.py
@@ -15,7 +15,7 @@
from .client import redis_client
from .tools import (Document, Reranker, Retriever, WebModule, ToolManager, FunctionCall,
FunctionCallAgent, fc_register, ReactAgent, PlanAndSolveAgent, ReWOOAgent, SentenceSplitter,
LLMParser, SimpleDirectoryReader)
LLMParser)
from .docs import add_doc

config.done()
@@ -73,7 +73,6 @@
'PlanAndSolveAgent',
'ReWOOAgent',
'SentenceSplitter',
'SimpleDirectoryReader',

# docs
'add_doc',
3 changes: 1 addition & 2 deletions lazyllm/tools/__init__.py
@@ -1,4 +1,4 @@
from .rag import (Document, Reranker, Retriever, SentenceSplitter, LLMParser, SimpleDirectoryReader)
from .rag import Document, Reranker, Retriever, SentenceSplitter, LLMParser
from .webpages import WebModule
from .agent import (
ToolManager,
@@ -29,5 +29,4 @@
"SentenceSplitter",
"SQLiteTool",
"SqlModule",
"SimpleDirectoryReader",
]
88 changes: 42 additions & 46 deletions lazyllm/tools/rag/dataReader.py
@@ -1,3 +1,7 @@
"""
The overall process of SimpleDirectoryReader is borrowed from LLAMA_INDEX, but we have added a customized part
based on it, that is, allowing users to register custom rules instead of processing only based on file suffixes.
"""
import os
import mimetypes
import multiprocessing
@@ -6,12 +10,12 @@
from datetime import datetime
from functools import reduce
from itertools import repeat
from typing import Dict, Optional, List, Callable
from typing import Dict, Optional, List, Callable, Type
from pathlib import Path, PurePosixPath, PurePath
from fsspec import AbstractFileSystem
from lazyllm import ModuleBase, LOG
from .store import DocNode
from .readers import (PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader,
from .readers import (ReaderBase, PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader,
EpubReader, MarkdownReader, MboxReader, PandasCSVReader, PandasExcelReader, VideoAudioReader,
get_default_fs, is_default_fs)

@@ -51,6 +55,29 @@ def __call__(self, file_path: str) -> Dict:
return {meta_key: meta_value for meta_key, meta_value in default_meta.items() if meta_value is not None}

class SimpleDirectoryReader(ModuleBase):
default_file_readers: Dict[str, Type[ReaderBase]] = {
"*.pdf": PDFReader,
"*.docx": DocxReader,
"*.hwp": HWPReader,
"*.pptx": PPTXReader,
"*.ppt": PPTXReader,
"*.pptm": PPTXReader,
"*.gif": ImageReader,
"*.jpeg": ImageReader,
"*.jpg": ImageReader,
"*.png": ImageReader,
"*.webp": ImageReader,
"*.ipynb": IPYNBReader,
"*.epub": EpubReader,
"*.md": MarkdownReader,
"*.mbox": MboxReader,
"*.csv": PandasCSVReader,
"*.xls": PandasExcelReader,
"*.xlsx": PandasExcelReader,
"*.mp3": VideoAudioReader,
"*.mp4": VideoAudioReader,
}

def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] = None,
exclude: Optional[List] = None, exclude_hidden: bool = True, recursive: bool = False,
encoding: str = "utf-8", filename_as_id: bool = False, required_exts: Optional[List[str]] = None,
@@ -59,7 +86,7 @@ def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List]
return_trace: bool = False) -> None:
super().__init__(return_trace=return_trace)

if not input_dir and not input_files:
if (not input_dir and not input_files) or (input_dir and input_files):
raise ValueError("Must provide either `input_dir` or `input_files`.")

self._fs = fs or get_default_fs()
@@ -85,10 +112,7 @@ def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List]
self._input_dir = self._Path(input_dir)
self._input_files = self._add_files(self._input_dir)

if file_extractor is not None:
self._file_extractor = file_extractor
else:
self._file_extractor = {}
self._file_extractor = file_extractor or {}

self._file_metadata = file_metadata or _DefaultFileMetadataFunc(self._fs)
self._filename_as_id = filename_as_id
@@ -166,25 +190,21 @@ def _exclude_metadata(self, documents: List[DocNode]) -> List[DocNode]:

@staticmethod
def load_file(input_file: Path, file_metadata: Callable[[str], Dict], file_extractor: Dict[str, Callable],
filename_as_id: bool = False, encoding: str = "utf-8", _Path: PurePath = Path,
filename_as_id: bool = False, encoding: str = "utf-8", pathm: PurePath = Path,
fs: Optional[AbstractFileSystem] = None) -> List[DocNode]:
default_file_reader = SimpleDirectoryReader._loading_default_file_reader()
default_file_reader_patterns = list(default_file_reader.keys())
metadata: Optional[dict] = None
documents: List[DocNode] = []

if file_metadata is not None: metadata = file_metadata(str(input_file))

file_reader_patterns = list(file_extractor.keys())
file_reader_patterns.extend(default_file_reader_patterns)

for pattern in file_reader_patterns:
pt = str(_Path(pattern))
match_pattern = pt if pt.startswith("*") else os.path.join(str(_Path.cwd()), pt)
pt = str(pathm(pattern))
match_pattern = pt if pt.startswith("*") else os.path.join(str(pathm.cwd()), pt)
if fnmatch.fnmatch(input_file, match_pattern):
reader = file_extractor[pattern] if pattern not in default_file_reader_patterns \
else default_file_reader[pattern]

reader = file_extractor[pattern]
reader = reader() if isinstance(reader, type) else reader
kwargs = {"extra_info": metadata}
if fs and not is_default_fs(fs): kwargs['fs'] = fs
docs = reader(input_file, **kwargs)
@@ -212,13 +232,14 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N
fs = fs or self._fs
process_file = self._input_files

if num_workers and num_workers > 1:
if num_workers and num_workers >= 1:
if num_workers > multiprocessing.cpu_count():
LOG.warning("Specified num_workers exceed number of CPUs in the system. "
"Setting `num_workers` down to the maximum CPU count.")
with multiprocessing.get_context("spawn").Pool(num_workers) as p:
results = p.starmap(SimpleDirectoryReader.load_file,
zip(process_file, repeat(self._file_metadata), repeat(self._file_extractor),
zip(process_file, repeat(self._file_metadata),
repeat({**self._file_extractor, **self.default_file_readers}),
repeat(self._filename_as_id), repeat(self._encoding), repeat(self._Path),
repeat(self._fs)))
documents = reduce(lambda x, y: x + y, results)
@@ -228,36 +249,11 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N
for input_file in process_file:
documents.extend(
SimpleDirectoryReader.load_file(input_file=input_file, file_metadata=self._file_metadata,
file_extractor=self._file_extractor, encoding=self._encoding,
filename_as_id=self._filename_as_id, _Path=self._Path, fs=self._fs))
file_extractor={**self._file_extractor, **self.default_file_readers},
filename_as_id=self._filename_as_id, encoding=self._encoding,
pathm=self._Path, fs=self._fs))

return self._exclude_metadata(documents)

def forward(self, *args, **kwargs) -> List[DocNode]:
return self._load_data(*args, **kwargs)

@staticmethod
def _loading_default_file_reader() -> Dict[str, Callable]:
default_file_reader: Dict[str, Callable] = {
"*.pdf": PDFReader(),
"*.docx": DocxReader(),
"*.hwp": HWPReader(),
"*.pptx": PPTXReader(),
"*.ppt": PPTXReader(),
"*.pptm": PPTXReader(),
"*.gif": ImageReader(),
"*.jpeg": ImageReader(),
"*.jpg": ImageReader(),
"*.png": ImageReader(),
"*.webp": ImageReader(),
"*.ipynb": IPYNBReader(),
"*.epub": EpubReader(),
"*.md": MarkdownReader(),
"*.mbox": MboxReader(),
"*.csv": PandasCSVReader(),
"*.xls": PandasExcelReader(),
"*.xlsx": PandasExcelReader(),
"*.mp3": VideoAudioReader(),
"*.mp4": VideoAudioReader(),
}
return default_file_reader
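For orientation (this usage sketch is not part of the diff): after this change the stock readers live in the class-level default_file_readers table, and anything passed through file_extractor is merged with that table before pattern matching. A minimal sketch, where the file path and the "*.mkd" pattern are made up for illustration:

from lazyllm.tools.rag import SimpleDirectoryReader
from lazyllm.tools.rag.readers import MarkdownReader

# Map a hypothetical extra suffix onto an existing reader class; entries given
# here are merged with SimpleDirectoryReader.default_file_readers at load time.
reader = SimpleDirectoryReader(input_files=["./notes/overview.mkd"],
                               file_extractor={"*.mkd": MarkdownReader})
nodes = reader()  # forward() delegates to _load_data() and returns a List[DocNode]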
19 changes: 11 additions & 8 deletions lazyllm/tools/rag/data_loaders.py
@@ -1,24 +1,27 @@
from typing import List, Optional, Dict, Callable
from typing import List, Optional, Dict
from .store import DocNode, LAZY_ROOT_NAME
from lazyllm import LOG
from .dataReader import SimpleDirectoryReader

class DirectoryReader:
def __init__(self, input_files: List[str], readers: Optional[Dict] = None) -> None:
def __init__(self, input_files: List[str], local_readers: Optional[Dict] = None,
global_readers: Optional[Dict] = None) -> None:
self._input_files = input_files
self._readers = readers
self._local_readers = local_readers
self._global_readers = global_readers

def load_data(self, input_files: Optional[List[str]] = None) -> List[DocNode]:
input_files = input_files or self._input_files

LOG.info(f"DirectoryReader loads data, input files: {input_files}")
reader = SimpleDirectoryReader(input_files=input_files, file_extractor=self._readers)
nodes = [doc for doc in reader() if setattr(doc, 'group', LAZY_ROOT_NAME) is None]
reader = SimpleDirectoryReader(input_files=input_files,
file_extractor={**self._local_readers, **self._global_readers})
nodes: List[DocNode] = []
for doc in reader():
doc.group = LAZY_ROOT_NAME
nodes.append(doc)
if not nodes:
LOG.warning(
f"No nodes load from path {self.input_files}, please check your data path."
)
return nodes

def register_file_reader(self, readers: Dict[str, Callable]):
self._readers = readers
5 changes: 3 additions & 2 deletions lazyllm/tools/rag/doc_impl.py
@@ -25,9 +25,10 @@ def wrapper(*args, **kwargs) -> List[float]:


class DocImpl:
def __init__(self, embed, doc_files=Optional[List[str]], readers: Optional[Dict] = None, **kwargs):
def __init__(self, embed, doc_files=Optional[List[str]], local_readers: Optional[Dict] = None,
global_readers: Optional[Dict] = None, **kwargs):
super().__init__()
self.directory_reader = DirectoryReader(doc_files, readers=readers)
self.directory_reader = DirectoryReader(doc_files, local_readers=local_readers, global_readers=global_readers)
self.node_groups: Dict[str, Dict] = {LAZY_ROOT_NAME: {}}
self._create_node_group_default()
self.embed = embed_wrapper(embed)
24 changes: 9 additions & 15 deletions lazyllm/tools/rag/document.py
@@ -23,9 +23,10 @@ def __init__(self, dataset_path: str, embed: Optional[TrainableModule] = None,
dataset_path = defatult_path
self._create_ui = create_ui
launcher = launcher if launcher else lazyllm.launchers.remote(sync=False)
self._instance_registered_file_reader: Dict[str, Callable] = {}
self._local_file_reader: Dict[str, Callable] = {}

self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed, readers=self._registered_file_reader)
self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed, local_readers=self._local_file_reader,
global_readers=self._registered_file_reader)
if create_ui:
doc_manager = DocManager(self._impl)
self.doc_server = ServerModule(doc_manager, launcher=launcher)
@@ -52,23 +53,16 @@ def create_node_group(
) -> None:
self._impl.create_node_group(name, transform, parent, **kwargs)

def register_instance_file_reader(self, readers: Dict[str, Callable]):
self._instance_registered_file_reader.update(readers)
self._impl._impl.directory_reader.register_file_reader({**self._instance_registered_file_reader,
**self._registered_file_reader})
def add_reader(self, pattern: str, func: Callable):
self._local_file_reader[pattern] = func

@classmethod
def register_cls_file_reader(cls, match_key: str):
if isinstance(match_key, type):
raise TypeError("Document.register_file_reader() missing 1 required positional argument: 'match_key'")
def register_global_reader(cls, pattern: str, func: Optional[Callable] = None):
if func is not None:
cls._registered_file_reader[pattern] = func

def decorator(klass):
if isinstance(klass, type): cls._registered_file_reader[match_key] = klass()
elif callable(klass): cls._registered_file_reader[match_key] = klass
if callable(klass): cls._registered_file_reader[pattern] = klass
else: raise TypeError(f"The registered object {klass} is not a callable object.")
return klass
return decorator

@classmethod
def get_registry(cls):
return cls._registered_file_reader
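The net effect of this hunk is a two-level registry: add_reader attaches a reader to a single Document instance, while register_global_reader (a classmethod that acts as a decorator when func is omitted) registers one for every instance. A rough sketch under those assumptions; the dataset path and the reader body are placeholders, not part of this PR:

from lazyllm import Document

@Document.register_global_reader("*.yml")   # shared by all Document instances
def read_yml(file, extra_info=None, fs=None):
    # A real reader would parse `file` and return a list of DocNode objects.
    return []

doc = Document(dataset_path="rag_master")   # hypothetical dataset directory
doc.add_reader("*.yaml", read_yml)          # instance-local registration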
6 changes: 4 additions & 2 deletions lazyllm/tools/rag/group_doc.py
@@ -10,7 +10,8 @@


class DocGroupImpl(lazyllm.ModuleBase):
def __init__(self, dataset_path, embed, readers: Optional[Dict] = None) -> None:
def __init__(self, dataset_path, embed, local_readers: Optional[Dict] = None,
global_readers: Optional[Dict] = None) -> None:
super().__init__()
self._dataset_path = dataset_path
self._embed = embed
@@ -22,7 +23,8 @@ def __init__(self, dataset_path, embed, readers: Optional[Dict] = None) -> None:

file_paths = self._list_all_files(self.dataset_path, lambda x: DATA_DIR in x)
self._impl: DocImpl = DocImpl(
doc_files=file_paths, embed=self._embed, readers=readers, doc_name="lazyllm_doc"
doc_files=file_paths, embed=self._embed, local_readers=local_readers, global_readers=global_readers,
doc_name="lazyllm_doc"
)

@property
1 change: 1 addition & 0 deletions lazyllm/tools/rag/readers/readme.md
@@ -0,0 +1 @@
Each reader module is borrowed from LLAMA_INDEX, but we have added customized parts; in particular, every reader inherits from the ModuleBase base class, making all reader modules callable.
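As a quick illustration of that claim (not shown in the diff, and the file name here is hypothetical), a stock reader instance can be invoked directly because ModuleBase makes it callable:

from lazyllm.tools.rag.readers import MarkdownReader

docs = MarkdownReader()("notes.md")  # the reader instance is called like any other lazyllm module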
6 changes: 3 additions & 3 deletions tests/basic_tests/test_rag_reader.py
@@ -1,8 +1,8 @@
import os
import lazyllm
from lazyllm import SimpleDirectoryReader, Document
from lazyllm import Document
from lazyllm.tools.rag.readers import ReaderBase
from lazyllm.tools.rag import DocNode
from lazyllm.tools.rag import SimpleDirectoryReader, DocNode

class YmlReader(ReaderBase):
def _load_data(self, file, extra_info=None, fs=None):
@@ -38,7 +38,7 @@ def test_reader_dir(self):
assert len(docs) == 13

def test_register_reader(self):
self.doc.register_instance_file_reader({lazyllm.config['data_path'] + "/**/*.yml": YmlReader()})
self.doc.add_reader("/**/*.yml", YmlReader)
files = [os.path.join(self.datasets, "reader_test.yml")]
docs = self.doc._impl._impl.directory_reader.load_data(input_files=files)
assert len(docs) == 1