From 3be037bf8b2b1682cd9457c83318c42f53fc13f0 Mon Sep 17 00:00:00 2001 From: wangjian Date: Thu, 8 Aug 2024 20:45:41 +0800 Subject: [PATCH 01/11] add lazyllm before group --- README.CN.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.CN.md b/README.CN.md index 8942563c..6aca225d 100644 --- a/README.CN.md +++ b/README.CN.md @@ -269,9 +269,9 @@ def test(input): def test_cmd(input): return f'echo input is {input}' -# >>> demo.test()(1) +# >>> lazyllm.demo.test()(1) # 'input is 1' -# >>> demo.test_cmd(launcher=launchers.slurm)(2) +# >>> lazyllm.demo.test_cmd(launcher=launchers.slurm)(2) # Command: srun -p pat_rd -N 1 --job-name=xf488db3 -n1 bash -c 'echo input is 2' ``` diff --git a/README.md b/README.md index 57dbd723..34533877 100644 --- a/README.md +++ b/README.md @@ -276,9 +276,9 @@ def test(input): def test_cmd(input): return f'echo input is {input}' -# >>> demo.test()(1) +# >>> lazyllm.demo.test()(1) # 'input is 1' -# >>> demo.test_cmd(launcher=launchers.slurm)(2) +# >>> lazyllm.demo.test_cmd(launcher=launchers.slurm)(2) # Command: srun -p pat_rd -N 1 --job-name=xf488db3 -n1 bash -c 'echo input is 2' ``` From cbf4a7e9d52a1be915ed1e829ab944e020bd8523 Mon Sep 17 00:00:00 2001 From: wangjian Date: Tue, 10 Sep 2024 16:44:31 +0800 Subject: [PATCH 02/11] add readers for rag --- lazyllm/__init__.py | 3 +- lazyllm/tools/__init__.py | 3 +- lazyllm/tools/rag/__init__.py | 16 ++ lazyllm/tools/rag/dataReader.py | 256 ++++++++++++++++++ lazyllm/tools/rag/data_loaders.py | 14 +- lazyllm/tools/rag/readers/__init__.py | 30 ++ lazyllm/tools/rag/readers/docxReader.py | 25 ++ lazyllm/tools/rag/readers/epubReader.py | 33 +++ lazyllm/tools/rag/readers/hwpReader.py | 92 +++++++ lazyllm/tools/rag/readers/imageReader.py | 100 +++++++ lazyllm/tools/rag/readers/ipynbReader.py | 36 +++ lazyllm/tools/rag/readers/markdownReader.py | 67 +++++ lazyllm/tools/rag/readers/mboxreader.py | 67 +++++ lazyllm/tools/rag/readers/pandasReader.py | 71 
+++++ lazyllm/tools/rag/readers/pdfReader.py | 44 +++ lazyllm/tools/rag/readers/pptxReader.py | 80 ++++++ lazyllm/tools/rag/readers/readerBase.py | 31 +++ lazyllm/tools/rag/readers/videoAudioReader.py | 48 ++++ lazyllm/tools/rag/store.py | 3 +- 19 files changed, 1004 insertions(+), 15 deletions(-) create mode 100644 lazyllm/tools/rag/dataReader.py create mode 100644 lazyllm/tools/rag/readers/__init__.py create mode 100644 lazyllm/tools/rag/readers/docxReader.py create mode 100644 lazyllm/tools/rag/readers/epubReader.py create mode 100644 lazyllm/tools/rag/readers/hwpReader.py create mode 100644 lazyllm/tools/rag/readers/imageReader.py create mode 100644 lazyllm/tools/rag/readers/ipynbReader.py create mode 100644 lazyllm/tools/rag/readers/markdownReader.py create mode 100644 lazyllm/tools/rag/readers/mboxreader.py create mode 100644 lazyllm/tools/rag/readers/pandasReader.py create mode 100644 lazyllm/tools/rag/readers/pdfReader.py create mode 100644 lazyllm/tools/rag/readers/pptxReader.py create mode 100644 lazyllm/tools/rag/readers/readerBase.py create mode 100644 lazyllm/tools/rag/readers/videoAudioReader.py diff --git a/lazyllm/__init__.py b/lazyllm/__init__.py index 4c8e6395..7b03c829 100644 --- a/lazyllm/__init__.py +++ b/lazyllm/__init__.py @@ -16,7 +16,7 @@ from .client import redis_client from .tools import (Document, Reranker, Retriever, WebModule, ToolManager, FunctionCall, FunctionCallAgent, fc_register, ReactAgent, PlanAndSolveAgent, ReWOOAgent, SentenceSplitter, - LLMParser) + LLMParser, SimpleDirectoryReader) from .engine import * # noqa F403 from .docs import add_doc @@ -75,6 +75,7 @@ 'PlanAndSolveAgent', 'ReWOOAgent', 'SentenceSplitter', + 'SimpleDirectoryReader', # docs 'add_doc', diff --git a/lazyllm/tools/__init__.py b/lazyllm/tools/__init__.py index 09789418..676f54f9 100644 --- a/lazyllm/tools/__init__.py +++ b/lazyllm/tools/__init__.py @@ -1,4 +1,4 @@ -from .rag import Document, Reranker, Retriever, SentenceSplitter, LLMParser +from .rag import 
(Document, Reranker, Retriever, SentenceSplitter, LLMParser, SimpleDirectoryReader) from .webpages import WebModule from .agent import ( ToolManager, @@ -29,4 +29,5 @@ "SentenceSplitter", "SQLiteTool", "SqlModule", + "SimpleDirectoryReader", ] diff --git a/lazyllm/tools/rag/__init__.py b/lazyllm/tools/rag/__init__.py index 444bdb26..2d1a89b6 100644 --- a/lazyllm/tools/rag/__init__.py +++ b/lazyllm/tools/rag/__init__.py @@ -4,6 +4,9 @@ from .transform import SentenceSplitter, LLMParser from .index import register_similarity from .store import DocNode +from .readers import (PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, EpubReader, + MarkdownReader, MboxReader, PandasCSVReader, PandasExcelReader, VideoAudioReader) +from .dataReader import SimpleDirectoryReader __all__ = [ @@ -15,4 +18,17 @@ "register_similarity", "register_reranker", "DocNode", + "PDFReader", + "DocxReader", + "HWPReader", + "PPTXReader", + "ImageReader", + "IPYNBReader", + "EpubReader", + "MarkdownReader", + "MboxReader", + "PandasCSVReader", + "PandasExcelReader", + "VideoAudioReader", + "SimpleDirectoryReader", ] diff --git a/lazyllm/tools/rag/dataReader.py b/lazyllm/tools/rag/dataReader.py new file mode 100644 index 00000000..f4e743ee --- /dev/null +++ b/lazyllm/tools/rag/dataReader.py @@ -0,0 +1,256 @@ +import os +import mimetypes +import multiprocessing +from tqdm import tqdm +from datetime import datetime +from functools import reduce +from itertools import repeat +from typing import Dict, Type, Optional, List, Callable +from pathlib import Path, PurePosixPath +from fsspec import AbstractFileSystem +from lazyllm import ModuleBase, LOG +from .store import DocNode +from .readers import (ReaderBase, PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, + EpubReader, MarkdownReader, MboxReader, PandasCSVReader, PandasExcelReader, VideoAudioReader, + get_default_fs, is_default_fs) + +def _file_timestamp_format(timestamp: float, include_time: bool = False) 
-> Optional[str]: + try: + if include_time: + return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%dT%H:%M:%SZ") + return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d") + except Exception: + return None + +class _DefaultFileMetadataFunc: + def __init__(self, fs: Optional[AbstractFileSystem] = None): + self._fs = fs or get_default_fs() + + def __call__(self, file_path: str) -> Dict: + stat_result = self._fs.stat(file_path) + + try: + file_name = os.path.basename(str(stat_result['name'])) + except Exception: + file_name = os.path.basename(file_path) + + creation_date = _file_timestamp_format(stat_result.get("created")) + last_modified_date = _file_timestamp_format(stat_result.get("mtime")) + last_accessed_date = _file_timestamp_format(stat_result.get("atime")) + default_meta = { + "file_path": file_path, + "file_name": file_name, + "file_type": mimetypes.guess_type(file_path)[0], + "file_size": stat_result.get("size"), + "creation_date": creation_date, + "last_modified_date": last_modified_date, + "last_accessed_date": last_accessed_date, + } + + return {meta_key: meta_value for meta_key, meta_value in default_meta.items() if meta_value is not None} + +class SimpleDirectoryReader(ModuleBase): + _registered_file_reader_cls: Dict[str, Type[ReaderBase]] = { + ".pdf": PDFReader, + ".docx": DocxReader, + ".hwp": HWPReader, + ".pptx": PPTXReader, + ".ppt": PPTXReader, + ".pptm": PPTXReader, + ".gif": ImageReader, + ".jpeg": ImageReader, + ".jpg": ImageReader, + ".png": ImageReader, + ".webp": ImageReader, + ".ipynb": IPYNBReader, + ".epub": EpubReader, + ".md": MarkdownReader, + ".mbox": MboxReader, + ".csv": PandasCSVReader, + ".xls": PandasExcelReader, + ".xlsx": PandasExcelReader, + ".mp3": VideoAudioReader, + ".mp4": VideoAudioReader, + } + + def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] = None, + exclude: Optional[List] = None, exclude_hidden: bool = True, recursive: bool = False, + encoding: str = "utf-8", 
filename_as_id: bool = False, required_exts: Optional[List[str]] = None, + fs: Optional[AbstractFileSystem] = None, file_metadata: Optional[Callable[[str], Dict]] = None, + num_files_limit: Optional[int] = None, return_trace: bool = False) -> None: + super().__init__(return_trace=return_trace) + + if not input_dir and not input_files: + raise ValueError("Must provide either `input_dir` or `input_files`.") + + self._fs = fs or get_default_fs() + self._encoding = encoding + + self._exclude = exclude + self._recursive = recursive + self._exclude_hidden = exclude_hidden + self._required_exts = required_exts + self._num_files_limit = num_files_limit + _Path = Path if is_default_fs(self._fs) else PurePosixPath + + if input_files: + self._input_files = [] + for path in input_files: + if not self._fs.isfile(path): + raise ValueError(f"File {path} does not exist.") + input_file = _Path(path) + self._input_files.append(input_file) + elif input_dir: + if not self._fs.isdir(input_dir): + raise ValueError(f"Directory {input_dir} does not exist.") + self._input_dir = _Path(input_dir) + self._input_files = self._add_files(self._input_dir) + + self._file_metadata = file_metadata or _DefaultFileMetadataFunc(self._fs) + self._filename_as_id = filename_as_id + + def _add_files(self, input_dir: Path) -> List[Path]: # noqa: C901 + all_files = set() + rejected_files = set() + rejected_dirs = set() + _Path = Path if is_default_fs(self._fs) else PurePosixPath + + if self._exclude is not None: + for excluded_pattern in self._exclude: + if self._recursive: + excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern + else: + excluded_glob = _Path(input_dir) / excluded_pattern + for file in self._fs.glob(str(excluded_glob)): + if self._fs.isdir(file): + rejected_dirs.add(_Path(file)) + else: + rejected_files.add(_Path(file)) + + file_refs: List[str] = [] + if self._recursive: + file_refs = self._fs.glob(str(input_dir) + "/**/*") + else: + file_refs = self._fs.glob(str(input_dir) + 
"/*") + + for ref in file_refs: + ref = _Path(ref) + is_dir = self._fs.isdir(ref) + skip_hidden = self._exclude_hidden and self._is_hidden(ref) + skip_bad_exts = (self._required_exts is not None and ref.suffix not in self._required_exts) + skip_excluded = ref in rejected_files + if not skip_excluded: + if is_dir: + ref_parent_dir = ref + else: + ref_parent_dir = self._fs._parent(ref) + for rejected_dir in rejected_dirs: + if str(ref_parent_dir).startswith(str(rejected_dir)): + skip_excluded = True + LOG.warning(f"Skipping {ref} because it in parent dir " + f"{ref_parent_dir} which is in {rejected_dir}.") + break + + if is_dir or skip_hidden or skip_bad_exts or skip_excluded: + continue + else: + all_files.add(ref) + + new_input_files = sorted(all_files) + + if len(new_input_files) == 0: + raise ValueError(f"No files found in {input_dir}.") + if self._num_files_limit is not None and self._num_files_limit > 0: + new_input_files = new_input_files[0: self._num_files_limit] + + LOG.debug(f"[SimpleDirectoryReader] Total files add: {len(new_input_files)}") + + return new_input_files + + def _is_hidden(self, path: Path) -> bool: + return any(part.startswith(".") and part not in [".", ".."] for part in path.parts) + + def _exclude_metadata(self, documents: List[DocNode]) -> List[DocNode]: + for doc in documents: + doc.excluded_embed_metadata_keys.extend( + ["file_name", "file_type", "file_size", "creation_date", + "last_modified_date", "last_accessed_date"]) + doc.excluded_llm_metadata_keys.extend( + ["file_name", "file_type", "file_size", "creation_date", + "last_modified_date", "last_accessed_date"]) + return documents + + @staticmethod + def load_file(input_file: Path, file_metadata: Callable[[str], Dict], filename_as_id: bool = False, + encoding: str = "utf-8", fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + file_reader_cls = SimpleDirectoryReader._registered_file_reader_cls + file_reader_suffix = list(file_reader_cls.keys()) + metadata: Optional[dict] = 
None + documents: List[DocNode] = [] + + if file_metadata is not None: + metadata = file_metadata(str(input_file)) + + file_suffix = input_file.suffix.lower() + if file_suffix in file_reader_suffix: + reader = file_reader_cls[file_suffix]() + + kwargs = {"extra_info": metadata} + if fs and not is_default_fs(fs): + kwargs['fs'] = fs + docs = reader.load_data(input_file, **kwargs) + + if filename_as_id: + for i, doc in enumerate(docs): + doc.uid = f"{input_file!s}_index_{i}" + + documents.extend(docs) + else: + fs = fs or get_default_fs() + with fs.open(input_file, encoding=encoding) as f: + data = f.read().decode(encoding) + + doc = DocNode(text=data, metadata=metadata or {}) + if filename_as_id: + doc.uid = str(input_file) + + documents.append(doc) + + return documents + + def load_data(self, show_progress: bool = False, num_workers: Optional[int] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + documents = [] + + fs = fs or self._fs + process_file = self._input_files + + if num_workers and num_workers > 1: + if num_workers > multiprocessing.cpu_count(): + LOG.warning("Specified num_workers exceed number of CPUs in the system. 
" + "Setting `num_workers` down to the maximum CPU count.") + with multiprocessing.get_context("spawn").Pool(num_workers) as p: + results = p.starmap(SimpleDirectoryReader.load_file, + zip(process_file, repeat(self._file_metadata), repeat(self._filename_as_id), + repeat(self._encoding), repeat(self._fs))) + documents = reduce(lambda x, y: x + y, results) + else: + if show_progress: + process_file = tqdm(self._input_files, desc="Loading files", unit="file") + for input_file in process_file: + documents.extend( + SimpleDirectoryReader.load_file(input_file=input_file, file_metadata=self._file_metadata, + filename_as_id=self._filename_as_id, encoding=self._encoding, + fs=self._fs)) + + return self._exclude_metadata(documents) + + @classmethod + def register_file_reader(cls, target_class: ReaderBase): + clazz = target_class.__name__.casefold().split("reader")[0] + cls._registered_file_reader_cls["." + clazz] = target_class + return target_class + + @classmethod + def get_registry(cls): + return cls._registered_file_reader_cls diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index e8135ae8..e91b48ea 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -1,7 +1,7 @@ from typing import List, Optional from .store import DocNode, LAZY_ROOT_NAME from lazyllm import LOG - +from .dataReader import SimpleDirectoryReader class DirectoryReader: def __init__(self, input_files: List[str]) -> None: @@ -9,20 +9,10 @@ def __init__(self, input_files: List[str]) -> None: def load_data(self, input_files: Optional[List[str]] = None) -> List[DocNode]: input_files = input_files or self._input_files - from llama_index.core import SimpleDirectoryReader LOG.info(f"DirectoryReader loads data, input files: {input_files}") reader = SimpleDirectoryReader(input_files=input_files) - nodes: List[DocNode] = [] - for doc in reader.load_data(): - node = DocNode( - text=doc.text, - group=LAZY_ROOT_NAME, - ) - node.metadata = 
doc.metadata - node.excluded_embed_metadata_keys = doc.excluded_embed_metadata_keys - node.excluded_llm_metadata_keys = doc.excluded_llm_metadata_keys - nodes.append(node) + nodes = [doc for doc in reader.load_data() if setattr(doc, 'group', LAZY_ROOT_NAME) is None] if not nodes: LOG.warning( f"No nodes load from path {self.input_files}, please check your data path." diff --git a/lazyllm/tools/rag/readers/__init__.py b/lazyllm/tools/rag/readers/__init__.py new file mode 100644 index 00000000..ff3182b6 --- /dev/null +++ b/lazyllm/tools/rag/readers/__init__.py @@ -0,0 +1,30 @@ +from .readerBase import LazyLLMReaderBase as ReaderBase, get_default_fs, is_default_fs +from .pdfReader import PDFReader +from .docxReader import DocxReader +from .hwpReader import HWPReader +from .pptxReader import PPTXReader +from .imageReader import ImageReader +from .ipynbReader import IPYNBReader +from .epubReader import EpubReader +from .markdownReader import MarkdownReader +from .mboxreader import MboxReader +from .pandasReader import PandasCSVReader, PandasExcelReader +from .videoAudioReader import VideoAudioReader + +__all__ = [ + "ReaderBase", + "get_default_fs", + "is_default_fs", + "PDFReader", + "DocxReader", + "HWPReader", + "PPTXReader", + "ImageReader", + "IPYNBReader", + "EpubReader", + "MarkdownReader", + "MboxReader", + "PandasCSVReader", + "PandasExcelReader", + "VideoAudioReader", +] diff --git a/lazyllm/tools/rag/readers/docxReader.py b/lazyllm/tools/rag/readers/docxReader.py new file mode 100644 index 00000000..308a29cb --- /dev/null +++ b/lazyllm/tools/rag/readers/docxReader.py @@ -0,0 +1,25 @@ +from pathlib import Path +from fsspec import AbstractFileSystem +from typing import Dict, Optional, List + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode + +class DocxReader(LazyLLMReaderBase): + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + if not isinstance(file, Path): 
file = Path(file) + try: + import docx2txt + except ImportError: + raise ImportError("docx2txt is required to read Microsoft Word files: `pip install docx2txt`") + + if fs: + with fs.open(file) as f: + text = docx2txt.process(f) + else: + text = docx2txt.process(file) + metadata = {"file_name": file.name} + if extra_info is not None: metadata.update(extra_info) + + return [DocNode(text=text, metadata=metadata)] diff --git a/lazyllm/tools/rag/readers/epubReader.py b/lazyllm/tools/rag/readers/epubReader.py new file mode 100644 index 00000000..699993a8 --- /dev/null +++ b/lazyllm/tools/rag/readers/epubReader.py @@ -0,0 +1,33 @@ +from pathlib import Path +from typing import Dict, List, Optional +from fsspec import AbstractFileSystem + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode +from lazyllm import LOG + +class EpubReader(LazyLLMReaderBase): + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + try: + import ebooklib + import html2text + from ebooklib import epub + except ImportError: + raise ImportError("Please install extra dependencies that are required " + "for the EpubReader: `pip install EbookLib html2text`") + + if not isinstance(file, Path): file = Path(file) + + if fs: + LOG.warning("fs was specified but EpubReader doesn't support loading from " + "fsspec filesystems. 
Will load from local filesystem instead.") + + text_list = [] + book = epub.read_epub(file, options={"ignore_ncx": True}) + + for item in book.get_items(): + if item.get_type() == ebooklib.ITEM_DOCUMENT: + text_list.append(html2text.html2text(item.get_content().decode("utf-8"))) + text = "\n".join(text_list) + return [DocNode(text=text, metadata=extra_info or {})] diff --git a/lazyllm/tools/rag/readers/hwpReader.py b/lazyllm/tools/rag/readers/hwpReader.py new file mode 100644 index 00000000..916b5197 --- /dev/null +++ b/lazyllm/tools/rag/readers/hwpReader.py @@ -0,0 +1,92 @@ +from fsspec import AbstractFileSystem +from pathlib import Path +import struct +from typing import Optional, Dict, List, Any +import zlib + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode +from lazyllm import LOG + +class HWPReader(LazyLLMReaderBase): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._FILE_HEADER_SECTION = "FileHeader" + self._HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation" + self._SECTION_NAME_LENGTH = len("Section") + self._BODYTEXT_SECTION = "BodyText" + self._HWP_TEXT_TAGS = [67] + self._text = "" + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + try: + import olefile + except ImportError: + raise ImportError("olefile is required to read hwp files: `pip install olefile`") + + if fs: + LOG.warning("fs was specified but HWPReader doesn't support loading from " + "fsspec filesystems. 
Will load from local filesystem instead.") + + if not isinstance(file, Path): file = Path(file) + + load_file = olefile.OleFileIO(file) + file_dir = load_file.listdir() + if self._is_valid(file_dir) is False: raise Exception("Not Valid HwpFile") + + result_text = self._get_text(load_file, file_dir) + return [DocNode(text=result_text, metadata=extra_info or {})] + + def _is_valid(self, dirs: List[str]) -> bool: + if [self._FILE_HEADER_SECTION] not in dirs: return False + return [self._HWP_SUMMARY_SECTION] in dirs + + def _text_to_docnode(self, text: str, extra_info: Optional[Dict] = None) -> DocNode: + return DocNode(text=text, metadata=extra_info or {}) + + def _get_text(self, load_file: Any, file_dirs: List[str]) -> str: + sections = self._get_body_sections(file_dirs) + text = "" + for section in sections: + text += self._get_text_from_section(load_file, section) + text += "\n" + + self._text = text + return self._text + + def _get_body_sections(self, dirs: List[str]) -> List[str]: + m = [] + for d in dirs: + if d[0] == self._BODYTEXT_SECTION: + m.append(int(d[1][self._SECTION_NAME_LENGTH:])) + + return ["BodyText/Section" + str(x) for x in sorted(m)] + + def _is_compressed(self, load_file: Any) -> bool: + header = load_file.openstream("FileHeader") + header_data = header.read() + return (header_data[36] & 1) == 1 + + def _get_text_from_section(self, load_file: Any, section: str) -> str: + bodytext = load_file.openstream(section) + data = bodytext.read() + + unpacked_data = (zlib.decompress(data, -15) if self._is_compressed(load_file) else data) + size = len(unpacked_data) + + i = 0 + text = "" + while i < size: + header = struct.unpack_from("<I", unpacked_data, i)[0] + rec_type = header & 0x3FF + level = (header >> 10) & 0x3FF + rec_len = (header >> 20) & 0xFFF + + if rec_type in self._HWP_TEXT_TAGS: + rec_data = unpacked_data[i + 4: i + 4 + rec_len] + text += rec_data.decode("utf-16") + text += "\n" + + i += 4 + rec_len + return text diff --git a/lazyllm/tools/rag/readers/imageReader.py b/lazyllm/tools/rag/readers/imageReader.py new
file mode 100644 index 00000000..7a085d77 --- /dev/null +++ b/lazyllm/tools/rag/readers/imageReader.py @@ -0,0 +1,100 @@ +import base64 +import re +from io import BytesIO +from pathlib import Path +from typing import Dict, List, Optional, Any, cast +from fsspec import AbstractFileSystem +from PIL import Image + +from .readerBase import LazyLLMReaderBase, infer_torch_device +from ..store import DocNode + +def img_2_b64(image: Image, format: str = "JPEG") -> str: + buff = BytesIO() + image.save(buff, format=format) + return cast(str, base64.b64encode(buff.getvalue())) + +def b64_2_img(data: str) -> Image: + buff = BytesIO(base64.b64decode(data)) + return Image.open(buff) + +class ImageReader(LazyLLMReaderBase): + def __init__(self, parser_config: Optional[Dict] = None, keep_image: bool = False, parse_text: bool = False, + text_type: str = "text", pytesseract_model_kwargs: Dict[str, Any] = {}) -> None: + self._text_type = text_type + if parser_config is None and parse_text: + if text_type == "plain_text": + try: + import pytesseract + except ImportError: + raise ImportError("Please install extra dependencies that are required for the ImageReader " + "when text_type is 'plain_text': `pip install pytesseract`") + + processor = None + model = pytesseract + else: + try: + import sentencepiece # noqa + import torch # noqa + from PIL import Image # noqa + from transformers import DonutProcessor, VisionEncoderDecoderModel + except ImportError: + raise ImportError("Please install extra dependencies that are required for the " + "ImageCaptionReader: `pip install torch transformers sentencepiece Pillow`") + + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") + parser_config = {'processor': processor, 'model': model} + + self._parser_config = parser_config + self._keep_image = keep_image + self._parse_text = parse_text + 
self._pytesseract_model_kwargs = pytesseract_model_kwargs + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + if not isinstance(file, Path): file = Path(file) + + if fs: + with fs.open(path=file) as f: + image = Image.open(f.read()) + else: + image = Image.open(file) + + if image.mode != "RGB": image = image.convert("RGB") + + image_str: Optional[str] = None # noqa + if self._keep_image: image_str = img_2_b64(image) # noqa + + text_str: str = "" + if self._parse_text: + assert self._parser_config is not None + model = self._parser_config["model"] + processor = self._parser_config["processor"] + + if processor: + device = infer_torch_device() + model.to(device) + + task_prompt = "<s_cord-v2>" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, + return_tensors='pt').input_ids + pixel_values = processor(image, return_tensors='pt').pixel_values + + output = model.generate(pixel_values.to(device), decoder_input_ids=decoder_input_ids.to(device), + max_length=model.decoder.config.max_position_embeddings, early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, use_cache=True, num_beams=3, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True) + + sequence = processor.batch_decode(output.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + text_str = re.sub(r"<.*?>", "", sequence, count=1).strip() + else: + import pytesseract + + model = cast(pytesseract, self._parser_config['model']) + text_str = model.image_to_string(image, **self._pytesseract_model_kwargs) + + return [DocNode(text=text_str, metadata=extra_info or {})] diff --git a/lazyllm/tools/rag/readers/ipynbReader.py b/lazyllm/tools/rag/readers/ipynbReader.py new file mode 100644 index 00000000..d08bd992 --- /dev/null +++
b/lazyllm/tools/rag/readers/ipynbReader.py @@ -0,0 +1,36 @@ +import re +from pathlib import Path +from typing import Dict, List, Optional +from fsspec import AbstractFileSystem + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode + +class IPYNBReader(LazyLLMReaderBase): + def __init__(self, parser_config: Optional[Dict] = None, concatenate: bool = False): + self._parser_config = parser_config + self._concatenate = concatenate + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + if not isinstance(file, Path): file = Path(file) + + if file.name.endswith(".ipynb"): + try: + import nbconvert + except ImportError: + raise ImportError("Please install nbconvert `pip install nbconvert`") + + if fs: + with fs.open(file, encoding='utf-8') as f: + doc_str = nbconvert.exporters.ScriptExporter().from_file(f)[0] + else: + doc_str = nbconvert.exporters.ScriptExporter().from_file(file)[0] + + splits = re.split(r"In\[\d+\]:", doc_str) + splits.pop(0) + + if self._concatenate: docs = [DocNode(text="\n\n".join(splits), metadata=extra_info or {})] + else: docs = [DocNode(text=s, metadata=extra_info or {}) for s in splits] + + return docs diff --git a/lazyllm/tools/rag/readers/markdownReader.py b/lazyllm/tools/rag/readers/markdownReader.py new file mode 100644 index 00000000..1a03a9f5 --- /dev/null +++ b/lazyllm/tools/rag/readers/markdownReader.py @@ -0,0 +1,67 @@ +import re +from pathlib import Path +from fsspec import AbstractFileSystem +from fsspec.implementations.local import LocalFileSystem +from typing import Dict, List, Optional, Tuple + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode + +class MarkdownReader(LazyLLMReaderBase): + def __init__(self, *args, remove_hyperlinks: bool = True, remove_images: bool = True, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + + 
def _markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: + markdown_tups: List[Tuple[Optional[str], str]] = [] + lines = markdown_text.split("\n") + + current_header = None + current_lines = [] + in_code_block = False + + for line in lines: + if line.startswith("```"): in_code_block = not in_code_block + + header_match = re.match(r"^#+\s", line) + if not in_code_block and header_match: + if current_header is not None or len(current_lines) > 0: + markdown_tups.append((current_header, "\n".join(current_lines))) + current_header = line + current_lines.clear() + else: + current_lines.append(line) + + markdown_tups.append((current_header, "\n".join(current_lines))) + return [(key if key is None else re.sub(r"#", "", key).strip(), re.sub(r"<.*?>", "", value),) + for key, value in markdown_tups] + + def remove_images(self, content: str) -> str: + pattern = r"!{1}\[\[(.*)\]\]" + return re.sub(pattern, "", content) + + def remove_hyperlinks(self, content: str) -> str: + pattern = r"\[(.*)\]\((.*)\)" + return re.sub(pattern, r"\1", content) + + def _parse_tups(self, filepath: Path, errors: str = "ignore", + fs: Optional[AbstractFileSystem] = None) -> List[Tuple[Optional[str], str]]: + fs = fs or LocalFileSystem() + + with fs.open(filepath, encoding="utf-8") as f: + content = f.read().decode(encoding="utf-8") + + if self._remove_hyperlinks: content = self.remove_hyperlinks(content) + if self._remove_images: content = self.remove_images(content) + return self._markdown_to_tups(content) + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + if not isinstance(file, Path): file = Path(file) + + tups = self._parse_tups(file, fs=fs) + results = [DocNode(text=value if header is None else f"\n\n{header}\n{value}", metadata=extra_info or {}) + for header, value in tups] + + return results diff --git a/lazyllm/tools/rag/readers/mboxreader.py 
b/lazyllm/tools/rag/readers/mboxreader.py new file mode 100644 index 00000000..93537406 --- /dev/null +++ b/lazyllm/tools/rag/readers/mboxreader.py @@ -0,0 +1,67 @@ +from pathlib import Path +from typing import Dict, List, Optional +from fsspec import AbstractFileSystem + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode +from lazyllm import LOG + +class MboxReader(LazyLLMReaderBase): + DEFAULT_MESSAGE_FORMAT: str = ( + "Date: {_date}\n" + "From: {_from}\n" + "To: {_to}\n" + "Subject: {_subject}\n" + "Content: {_content}" + ) + + def __init__(self, *args, max_count: int = 0, message_format: str = DEFAULT_MESSAGE_FORMAT, **kwargs) -> None: + try: + from bs4 import BeautifulSoup # noqa + except ImportError: + raise ImportError("`BeautifulSoup` package not found: `pip install beautifulsoup4`") + + super().__init__(*args, **kwargs) + self._max_count = max_count + self._message_format = message_format + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + import mailbox + from email.parser import BytesParser + from email.policy import default + from bs4 import BeautifulSoup + + if fs: + LOG.warning("fs was specified but MboxReader doesn't support loading from " + "fsspec filesystems. 
Will load from local filesystem instead.") + + i = 0 + results: List[str] = [] + bytes_parser = BytesParser(policy=default).parse + mbox = mailbox.mbox(file, factory=bytes_parser) + + for _, _msg in enumerate(mbox): + try: + msg: mailbox.mboxMessage = _msg + if msg.is_multipart(): + for part in msg.walk(): + ctype = part.get_content_type() + cdispo = str(part.get("Content-Disposition")) + if ctype == "text/plain" and "attachment" not in cdispo: + content = part.get_payload(decode=True) + break + else: + content = msg.get_payload(decode=True) + + soup = BeautifulSoup(content) + stripped_content = " ".join(soup.get_text().split()) + msg_string = self._message_format.format(_date=msg["date"], _from=msg["from"], _to=msg["to"], + _subject=msg["subject"], _content=stripped_content) + results.append(msg_string) + except Exception as e: + LOG.warning(f"Failed to parse message:\n{_msg}\n with exception {e}") + + i += 1 + if self._max_count > 0 and i >= self._max_count: break + return [DocNode(text=result, metadata=extra_info or {}) for result in results] diff --git a/lazyllm/tools/rag/readers/pandasReader.py b/lazyllm/tools/rag/readers/pandasReader.py new file mode 100644 index 00000000..69d2c75d --- /dev/null +++ b/lazyllm/tools/rag/readers/pandasReader.py @@ -0,0 +1,71 @@ +from pathlib import Path +from typing import Dict, List, Optional +from fsspec import AbstractFileSystem +import importlib +import pandas as pd + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode + +class PandasCSVReader(LazyLLMReaderBase): + def __init__(self, *args, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", + pandas_config: dict = {}, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._col_joiner = col_joiner + self._row_joiner = row_joiner + self._pandas_config = pandas_config + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> 
List[DocNode]: + if not isinstance(file, Path): file = Path(file) + + if fs: + with fs.open(file) as f: + df = pd.read_csv(f, **self._pandas_config) + else: + df = pd.read_csv(file, **self._pandas_config) + + text_list = df.apply(lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1).tolist() + + if self._concat_rows: return [DocNode(text=(self._row_joiner).join(text_list), metadata=extra_info or {})] + else: return [DocNode(text=text, metadata=extra_info or {}) for text in text_list] + +class PandasExcelReader(LazyLLMReaderBase): + def __init__(self, *args, concat_rows: bool = True, sheet_name: Optional[str] = None, + pandas_config: dict = {}, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._sheet_name = sheet_name + self._pandas_config = pandas_config + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + openpyxl_spec = importlib.util.find_spec("openpyxl") + if openpyxl_spec is not None: pass + else: raise ImportError("Please install openpyxl to read Excel files. 
" + "You can install it with `pip install openpyxl`") + + if not isinstance(file, Path): file = Path(file) + if fs: + with fs.open(file) as f: + dfs = pd.read_excel(f, self._sheet_name, **self._pandas_config) + else: + dfs = pd.read_excel(file, self._sheet_name, **self._pandas_config) + + documents = [] + if isinstance(dfs, pd.DataFrame): + df = dfs.fillna("") + text_list = (df.astype(str).apply(lambda row: " ".join(row.values), axis=1).tolist()) + + if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {})) + else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list]) + else: + for df in dfs.values(): + df = df.fillna("") + text_list = (df.astype(str).apply(lambda row: " ".join(row), axis=1).tolist()) + + if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {})) + else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list]) + + return documents diff --git a/lazyllm/tools/rag/readers/pdfReader.py b/lazyllm/tools/rag/readers/pdfReader.py new file mode 100644 index 00000000..315cdca8 --- /dev/null +++ b/lazyllm/tools/rag/readers/pdfReader.py @@ -0,0 +1,44 @@ +import io +from tenacity import retry, stop_after_attempt +from pathlib import Path +from typing import Dict, List, Optional +from fsspec import AbstractFileSystem + +from .readerBase import LazyLLMReaderBase, get_default_fs, is_default_fs +from ..store import DocNode + +RETRY_TIMES = 3 + +class PDFReader(LazyLLMReaderBase): + def __init__(self, return_full_document: bool = False) -> None: + self._return_full_document = return_full_document + + @retry(stop=stop_after_attempt(RETRY_TIMES)) + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + if not isinstance(file, Path): file = Path(file) + + try: + import pypdf + except ImportError: + raise ImportError("pypdf is required to read PDF 
files: `pip install pypdf`") + + fs = fs or get_default_fs() + with fs.open(file, 'rb') as fp: + stream = fp if is_default_fs(fs) else io.BytesIO(fp.read()) + pdf = pypdf.PdfReader(stream) + num_pages = len(pdf.pages) + docs = [] + if self._return_full_document: + metadata = {"file_name": file.name} + if extra_info is not None: metadata.update(extra_info) + text = "\n".join(pdf.pages[page].extract_text() for page in range(num_pages)) + docs.append(DocNode(text=text, metadata=metadata)) + else: + for page in range(num_pages): + page_text = pdf.pages[page].extract_text() + page_label = pdf.page_labels[page] + metadata = {"page_label": page_label, "file_name": file.name} + if extra_info is not None: metadata.update(extra_info) + docs.append(DocNode(text=page_text, metadata=metadata)) + return docs diff --git a/lazyllm/tools/rag/readers/pptxReader.py b/lazyllm/tools/rag/readers/pptxReader.py new file mode 100644 index 00000000..fb9ef304 --- /dev/null +++ b/lazyllm/tools/rag/readers/pptxReader.py @@ -0,0 +1,80 @@ +import os +import tempfile +from fsspec import AbstractFileSystem +from pathlib import Path +from typing import Optional, Dict, List + +from .readerBase import LazyLLMReaderBase, infer_torch_device +from ..store import DocNode + +class PPTXReader(LazyLLMReaderBase): + def __init__(self) -> None: + try: + import torch # noqa + from PIL import Image # noqa + from pptx import Presentation # noqa + from transformers import (AutoTokenizer, VisionEncoderDecoderModel, ViTFeatureExtractor,) + except ImportError: + raise ImportError("Please install extra dependencies that are required for the " + "PPTXReader: `pip install torch transformers python-pptx Pillow`") + + model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + + self._parser_config = 
{"feature_extractor": feature_extractor, "model": model, "tokenizer": tokenizer} + + def _caption_image(self, tmp_image_file: str) -> str: + from PIL import Image + + model = self._parser_config['model'] + feature_extractor = self._parser_config['feature_extractor'] + tokenizer = self._parser_config['tokenizer'] + + device = infer_torch_device() + model.to(device) + + max_length = 16 + num_beams = 4 + gen_kwargs = {"max_length": max_length, "num_beams": num_beams} + + i_image = Image.open(tmp_image_file) + if i_image.mode != "RGB": i_image = i_image.convert(mode="RGB") + + pixel_values = feature_extractor(images=[i_image], return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device) + + output_ids = model.generate(pixel_values, **gen_kwargs) + + preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + return preds[0].strip() + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + from pptx import Presentation + + if not isinstance(file, Path): file = Path(file) + + if fs: + with fs.open(file) as f: + presentation = Presentation(f) + else: + presentation = Presentation(file) + + result = "" + for i, slide in enumerate(presentation.slides): + result += f"\n\nSlide #{i}: \n" + for shape in slide.shapes: + if hasattr(shape, "image"): + image = shape.image + image_bytes = image.blob + f = tempfile.NamedTemporaryFile("wb", delete=False) + try: + f.write(image_bytes) + f.close() + result += f"\n Image: {self._caption_image(f.name)}\n\n" + finally: + os.unlink(f.name) + + if hasattr(shape, "text"): result += f"{shape.text}\n" + return [DocNode(text=result, metadata=extra_info or {})] diff --git a/lazyllm/tools/rag/readers/readerBase.py b/lazyllm/tools/rag/readers/readerBase.py new file mode 100644 index 00000000..14f2d0af --- /dev/null +++ b/lazyllm/tools/rag/readers/readerBase.py @@ -0,0 +1,31 @@ +import fsspec +from fsspec.implementations.local import LocalFileSystem 
+from typing import Iterable, List + +from ....common import LazyLLMRegisterMetaClass +from ..store import DocNode + +class LazyLLMReaderBase(metaclass=LazyLLMRegisterMetaClass): + def lazy_load_data(self, *args, **load_kwargs) -> Iterable[DocNode]: + raise NotImplementedError(f"{self.__class__.__name__} does not implement lazy_load_data method.") + + def load_data(self, *args, **load_kwargs) -> List[DocNode]: + return list(self.lazy_load_data(*args, **load_kwargs)) + + +def get_default_fs(): + return LocalFileSystem() + +def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool: + return isinstance(fs, LocalFileSystem) or not fs.auto_mkdir + +def infer_torch_device() -> str: + try: + has_cuda = torch.cuda.is_available() + except NameError: + import torch + has_cuda = torch.cuda.is_available() + + if has_cuda: return "cuda" + if torch.backends.mps.is_available(): return "mps" + return "cpu" diff --git a/lazyllm/tools/rag/readers/videoAudioReader.py b/lazyllm/tools/rag/readers/videoAudioReader.py new file mode 100644 index 00000000..f375d82e --- /dev/null +++ b/lazyllm/tools/rag/readers/videoAudioReader.py @@ -0,0 +1,48 @@ +from pathlib import Path +from typing import Dict, List, Optional, cast +from fsspec import AbstractFileSystem + +from .readerBase import LazyLLMReaderBase +from ..store import DocNode + +class VideoAudioReader(LazyLLMReaderBase): + def __init__(self, *args, model_version: str = "base", **kwargs) -> None: + super().__init__(*args, **kwargs) + self._model_version = model_version + + try: + import whisper + except ImportError: + raise ImportError("Please install OpenAI whisper model " + "`pip install git+https://github.com/openai/whisper.git` to use the model") + + model = whisper.load_model(self._model_version) + self._parser_config = {"model": model} + + def load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + import whisper + + if not isinstance(file, Path): file = 
Path(file) + + if file.name.endswith("mp4"): + try: + from pydub import AudioSegment + except ImportError: + raise ImportError("Please install pydub `pip install pydub`") + + if fs: + with fs.open(file, 'rb') as f: + video = AudioSegment.from_file(f, format="mp4") + else: + video = AudioSegment.from_file(file, format="mp4") + + audio = video.split_to_mono()[0] + file_str = str(file)[:-4] + ".mp3" + audio.export(file_str, format="mp3") + + model = cast(whisper.Whisper, self._parser_config["model"]) + result = model.transcribe(str(file)) + + transcript = result['text'] + return [DocNode(text=transcript, metadata=extra_info or {})] diff --git a/lazyllm/tools/rag/store.py b/lazyllm/tools/rag/store.py index 2b04401c..762d28dc 100644 --- a/lazyllm/tools/rag/store.py +++ b/lazyllm/tools/rag/store.py @@ -28,12 +28,13 @@ def __init__( group: Optional[str] = None, embedding: Optional[List[float]] = None, parent: Optional["DocNode"] = None, + metadata: Optional[Dict[str, Any]] = {}, ) -> None: self.uid: str = uid if uid else str(uuid.uuid4()) self.text: Optional[str] = text self.group: Optional[str] = group self.embedding: Optional[List[float]] = embedding or None - self._metadata: Dict[str, Any] = {} + self._metadata: Dict[str, Any] = metadata # Metadata keys that are excluded from text for the embed model. self._excluded_embed_metadata_keys: List[str] = [] # Metadata keys that are excluded from text for the LLM. 
From aab9c478aeea68b243e163763ae73f25c504e85a Mon Sep 17 00:00:00 2001 From: wangjian Date: Thu, 12 Sep 2024 15:07:05 +0800 Subject: [PATCH 03/11] optimizing reader register process --- lazyllm/tools/rag/dataReader.py | 149 +++++++++--------- lazyllm/tools/rag/data_loaders.py | 12 +- lazyllm/tools/rag/doc_impl.py | 4 +- lazyllm/tools/rag/document.py | 31 +++- lazyllm/tools/rag/group_doc.py | 6 +- lazyllm/tools/rag/readers/docxReader.py | 4 +- lazyllm/tools/rag/readers/epubReader.py | 4 +- lazyllm/tools/rag/readers/hwpReader.py | 8 +- lazyllm/tools/rag/readers/imageReader.py | 12 +- lazyllm/tools/rag/readers/ipynbReader.py | 7 +- lazyllm/tools/rag/readers/markdownReader.py | 8 +- lazyllm/tools/rag/readers/mboxreader.py | 9 +- lazyllm/tools/rag/readers/pandasReader.py | 24 +-- lazyllm/tools/rag/readers/pdfReader.py | 7 +- lazyllm/tools/rag/readers/pptxReader.py | 7 +- lazyllm/tools/rag/readers/readerBase.py | 15 +- lazyllm/tools/rag/readers/videoAudioReader.py | 8 +- lazyllm/tools/rag/store.py | 4 +- tests/basic_tests/test_rag_reader.py | 44 ++++++ 19 files changed, 226 insertions(+), 137 deletions(-) create mode 100644 tests/basic_tests/test_rag_reader.py diff --git a/lazyllm/tools/rag/dataReader.py b/lazyllm/tools/rag/dataReader.py index f4e743ee..1b05a879 100644 --- a/lazyllm/tools/rag/dataReader.py +++ b/lazyllm/tools/rag/dataReader.py @@ -1,16 +1,17 @@ import os import mimetypes import multiprocessing +import fnmatch from tqdm import tqdm from datetime import datetime from functools import reduce from itertools import repeat -from typing import Dict, Type, Optional, List, Callable -from pathlib import Path, PurePosixPath +from typing import Dict, Optional, List, Callable +from pathlib import Path, PurePosixPath, PurePath from fsspec import AbstractFileSystem from lazyllm import ModuleBase, LOG from .store import DocNode -from .readers import (ReaderBase, PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, +from .readers import (PDFReader, 
DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, EpubReader, MarkdownReader, MboxReader, PandasCSVReader, PandasExcelReader, VideoAudioReader, get_default_fs, is_default_fs) @@ -50,34 +51,12 @@ def __call__(self, file_path: str) -> Dict: return {meta_key: meta_value for meta_key, meta_value in default_meta.items() if meta_value is not None} class SimpleDirectoryReader(ModuleBase): - _registered_file_reader_cls: Dict[str, Type[ReaderBase]] = { - ".pdf": PDFReader, - ".docx": DocxReader, - ".hwp": HWPReader, - ".pptx": PPTXReader, - ".ppt": PPTXReader, - ".pptm": PPTXReader, - ".gif": ImageReader, - ".jpeg": ImageReader, - ".jpg": ImageReader, - ".png": ImageReader, - ".webp": ImageReader, - ".ipynb": IPYNBReader, - ".epub": EpubReader, - ".md": MarkdownReader, - ".mbox": MboxReader, - ".csv": PandasCSVReader, - ".xls": PandasExcelReader, - ".xlsx": PandasExcelReader, - ".mp3": VideoAudioReader, - ".mp4": VideoAudioReader, - } - def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] = None, exclude: Optional[List] = None, exclude_hidden: bool = True, recursive: bool = False, encoding: str = "utf-8", filename_as_id: bool = False, required_exts: Optional[List[str]] = None, - fs: Optional[AbstractFileSystem] = None, file_metadata: Optional[Callable[[str], Dict]] = None, - num_files_limit: Optional[int] = None, return_trace: bool = False) -> None: + file_extractor: Optional[Dict[str, Callable]] = None, fs: Optional[AbstractFileSystem] = None, + file_metadata: Optional[Callable[[str], Dict]] = None, num_files_limit: Optional[int] = None, + return_trace: bool = False) -> None: super().__init__(return_trace=return_trace) if not input_dir and not input_files: @@ -91,21 +70,26 @@ def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] self._exclude_hidden = exclude_hidden self._required_exts = required_exts self._num_files_limit = num_files_limit - _Path = Path if is_default_fs(self._fs) else PurePosixPath + 
self._Path = Path if is_default_fs(self._fs) else PurePosixPath if input_files: self._input_files = [] for path in input_files: if not self._fs.isfile(path): raise ValueError(f"File {path} does not exist.") - input_file = _Path(path) + input_file = self._Path(path) self._input_files.append(input_file) elif input_dir: if not self._fs.isdir(input_dir): raise ValueError(f"Directory {input_dir} does not exist.") - self._input_dir = _Path(input_dir) + self._input_dir = self._Path(input_dir) self._input_files = self._add_files(self._input_dir) + if file_extractor is not None: + self._file_extractor = file_extractor + else: + self._file_extractor = {} + self._file_metadata = file_metadata or _DefaultFileMetadataFunc(self._fs) self._filename_as_id = filename_as_id @@ -113,19 +97,18 @@ def _add_files(self, input_dir: Path) -> List[Path]: # noqa: C901 all_files = set() rejected_files = set() rejected_dirs = set() - _Path = Path if is_default_fs(self._fs) else PurePosixPath if self._exclude is not None: for excluded_pattern in self._exclude: if self._recursive: - excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern + excluded_glob = self._Path(input_dir) / self._Path("**") / excluded_pattern else: - excluded_glob = _Path(input_dir) / excluded_pattern + excluded_glob = self._Path(input_dir) / excluded_pattern for file in self._fs.glob(str(excluded_glob)): if self._fs.isdir(file): - rejected_dirs.add(_Path(file)) + rejected_dirs.add(self._Path(file)) else: - rejected_files.add(_Path(file)) + rejected_files.add(self._Path(file)) file_refs: List[str] = [] if self._recursive: @@ -134,7 +117,7 @@ def _add_files(self, input_dir: Path) -> List[Path]: # noqa: C901 file_refs = self._fs.glob(str(input_dir) + "/*") for ref in file_refs: - ref = _Path(ref) + ref = self._Path(ref) is_dir = self._fs.isdir(ref) skip_hidden = self._exclude_hidden and self._is_hidden(ref) skip_bad_exts = (self._required_exts is not None and ref.suffix not in self._required_exts) @@ -165,6 +148,7 
@@ def _add_files(self, input_dir: Path) -> List[Path]: # noqa: C901 LOG.debug(f"[SimpleDirectoryReader] Total files add: {len(new_input_files)}") + LOG.info(f"input_files: {new_input_files}") return new_input_files def _is_hidden(self, path: Path) -> bool: @@ -181,45 +165,48 @@ def _exclude_metadata(self, documents: List[DocNode]) -> List[DocNode]: return documents @staticmethod - def load_file(input_file: Path, file_metadata: Callable[[str], Dict], filename_as_id: bool = False, - encoding: str = "utf-8", fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: - file_reader_cls = SimpleDirectoryReader._registered_file_reader_cls - file_reader_suffix = list(file_reader_cls.keys()) + def load_file(input_file: Path, file_metadata: Callable[[str], Dict], file_extractor: Dict[str, Callable], + filename_as_id: bool = False, encoding: str = "utf-8", _Path: PurePath = Path, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + default_file_reader = SimpleDirectoryReader._loading_default_file_reader() + default_file_reader_patterns = list(default_file_reader.keys()) metadata: Optional[dict] = None documents: List[DocNode] = [] - if file_metadata is not None: - metadata = file_metadata(str(input_file)) + if file_metadata is not None: metadata = file_metadata(str(input_file)) - file_suffix = input_file.suffix.lower() - if file_suffix in file_reader_suffix: - reader = file_reader_cls[file_suffix]() + file_reader_patterns = list(file_extractor.keys()) + file_reader_patterns.extend(default_file_reader_patterns) - kwargs = {"extra_info": metadata} - if fs and not is_default_fs(fs): - kwargs['fs'] = fs - docs = reader.load_data(input_file, **kwargs) + for pattern in file_reader_patterns: + pt = str(_Path(pattern)) + match_pattern = pt if pt.startswith("*") else os.path.join(str(_Path.cwd()), pt) + if fnmatch.fnmatch(input_file, match_pattern): + reader = file_extractor[pattern] if pattern not in default_file_reader_patterns \ + else default_file_reader[pattern] - if 
filename_as_id: - for i, doc in enumerate(docs): - doc.uid = f"{input_file!s}_index_{i}" + kwargs = {"extra_info": metadata} + if fs and not is_default_fs(fs): kwargs['fs'] = fs + docs = reader(input_file, **kwargs) - documents.extend(docs) + if filename_as_id: + for i, doc in enumerate(docs): + doc.uid = f"{input_file!s}_index_{i}" + documents.extend(docs) + break else: fs = fs or get_default_fs() with fs.open(input_file, encoding=encoding) as f: data = f.read().decode(encoding) doc = DocNode(text=data, metadata=metadata or {}) - if filename_as_id: - doc.uid = str(input_file) - + if filename_as_id: doc.uid = str(input_file) documents.append(doc) return documents - def load_data(self, show_progress: bool = False, num_workers: Optional[int] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: documents = [] fs = fs or self._fs @@ -231,8 +218,9 @@ def load_data(self, show_progress: bool = False, num_workers: Optional[int] = No "Setting `num_workers` down to the maximum CPU count.") with multiprocessing.get_context("spawn").Pool(num_workers) as p: results = p.starmap(SimpleDirectoryReader.load_file, - zip(process_file, repeat(self._file_metadata), repeat(self._filename_as_id), - repeat(self._encoding), repeat(self._fs))) + zip(process_file, repeat(self._file_metadata), repeat(self._file_extractor), + repeat(self._filename_as_id), repeat(self._encoding), repeat(self._Path), + repeat(self._fs))) documents = reduce(lambda x, y: x + y, results) else: if show_progress: @@ -240,17 +228,36 @@ def load_data(self, show_progress: bool = False, num_workers: Optional[int] = No for input_file in process_file: documents.extend( SimpleDirectoryReader.load_file(input_file=input_file, file_metadata=self._file_metadata, - filename_as_id=self._filename_as_id, encoding=self._encoding, - fs=self._fs)) + 
file_extractor=self._file_extractor, encoding=self._encoding, + filename_as_id=self._filename_as_id, _Path=self._Path, fs=self._fs)) return self._exclude_metadata(documents) - @classmethod - def register_file_reader(cls, target_class: ReaderBase): - clazz = target_class.__name__.casefold().split("reader")[0] - cls._registered_file_reader_cls["." + clazz] = target_class - return target_class + def forward(self, *args, **kwargs) -> List[DocNode]: + return self._load_data(*args, **kwargs) - @classmethod - def get_registry(cls): - return cls._registered_file_reader_cls + @staticmethod + def _loading_default_file_reader() -> Dict[str, Callable]: + default_file_reader: Dict[str, Callable] = { + "*.pdf": PDFReader(), + "*.docx": DocxReader(), + "*.hwp": HWPReader(), + "*.pptx": PPTXReader(), + "*.ppt": PPTXReader(), + "*.pptm": PPTXReader(), + "*.gif": ImageReader(), + "*.jpeg": ImageReader(), + "*.jpg": ImageReader(), + "*.png": ImageReader(), + "*.webp": ImageReader(), + "*.ipynb": IPYNBReader(), + "*.epub": EpubReader(), + "*.md": MarkdownReader(), + "*.mbox": MboxReader(), + "*.csv": PandasCSVReader(), + "*.xls": PandasExcelReader(), + "*.xlsx": PandasExcelReader(), + "*.mp3": VideoAudioReader(), + "*.mp4": VideoAudioReader(), + } + return default_file_reader diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index e91b48ea..fce712df 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -1,20 +1,24 @@ -from typing import List, Optional +from typing import List, Optional, Dict, Callable from .store import DocNode, LAZY_ROOT_NAME from lazyllm import LOG from .dataReader import SimpleDirectoryReader class DirectoryReader: - def __init__(self, input_files: List[str]) -> None: + def __init__(self, input_files: List[str], readers: Optional[Dict] = None) -> None: self._input_files = input_files + self._readers = readers def load_data(self, input_files: Optional[List[str]] = None) -> List[DocNode]: 
input_files = input_files or self._input_files LOG.info(f"DirectoryReader loads data, input files: {input_files}") - reader = SimpleDirectoryReader(input_files=input_files) - nodes = [doc for doc in reader.load_data() if setattr(doc, 'group', LAZY_ROOT_NAME) is None] + reader = SimpleDirectoryReader(input_files=input_files, file_extractor=self._readers) + nodes = [doc for doc in reader() if setattr(doc, 'group', LAZY_ROOT_NAME) is None] if not nodes: LOG.warning( f"No nodes load from path {self.input_files}, please check your data path." ) return nodes + + def register_file_reader(self, readers: Dict[str, Callable]): + self._readers = readers diff --git a/lazyllm/tools/rag/doc_impl.py b/lazyllm/tools/rag/doc_impl.py index 0ce5a1ae..bc0c5dfc 100644 --- a/lazyllm/tools/rag/doc_impl.py +++ b/lazyllm/tools/rag/doc_impl.py @@ -25,9 +25,9 @@ def wrapper(*args, **kwargs) -> List[float]: class DocImpl: - def __init__(self, embed, doc_files=Optional[List[str]], **kwargs): + def __init__(self, embed, doc_files=Optional[List[str]], readers: Optional[Dict] = None, **kwargs): super().__init__() - self.directory_reader = DirectoryReader(doc_files) + self.directory_reader = DirectoryReader(doc_files, readers=readers) self.node_groups: Dict[str, Dict] = {LAZY_ROOT_NAME: {}} self._create_node_group_default() self.embed = embed_wrapper(embed) diff --git a/lazyllm/tools/rag/document.py b/lazyllm/tools/rag/document.py index 7e4a311a..23c930af 100644 --- a/lazyllm/tools/rag/document.py +++ b/lazyllm/tools/rag/document.py @@ -1,7 +1,7 @@ from functools import partial import os -from typing import Callable, Optional +from typing import Callable, Optional, Dict import lazyllm from lazyllm import ModuleBase, ServerModule, TrainableModule @@ -12,6 +12,8 @@ class Document(ModuleBase): + _registered_file_reader: Dict[str, Callable] = {} + def __init__(self, dataset_path: str, embed: Optional[TrainableModule] = None, create_ui: bool = True, launcher=None): super().__init__() @@ -21,15 +23,13 
@@ def __init__(self, dataset_path: str, embed: Optional[TrainableModule] = None, dataset_path = defatult_path self._create_ui = create_ui launcher = launcher if launcher else lazyllm.launchers.remote(sync=False) + self._instance_registered_file_reader: Dict[str, Callable] = {} + self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed, readers=self._registered_file_reader) if create_ui: - self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed) doc_manager = DocManager(self._impl) self.doc_server = ServerModule(doc_manager, launcher=launcher) - self.web = DocWebModule(doc_server=self.doc_server) - else: - self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed) def forward(self, func_name: str, *args, **kwargs): if self._create_ui: @@ -51,3 +51,24 @@ def create_node_group( self, name: str, transform: Callable, parent: str = LAZY_ROOT_NAME, **kwargs ) -> None: self._impl.create_node_group(name, transform, parent, **kwargs) + + def register_instance_file_reader(self, readers: Dict[str, Callable]): + self._instance_registered_file_reader.update(readers) + self._impl._impl.directory_reader.register_file_reader({**self._instance_registered_file_reader, + **self._registered_file_reader}) + + @classmethod + def register_cls_file_reader(cls, match_key: str): + if isinstance(match_key, type): + raise TypeError("Document.register_file_reader() missing 1 required positional argument: 'match_key'") + + def decorator(klass): + if isinstance(klass, type): cls._registered_file_reader[match_key] = klass() + elif callable(klass): cls._registered_file_reader[match_key] = klass + else: raise TypeError(f"The registered object {klass} is not a callable object.") + return klass + return decorator + + @classmethod + def get_registry(cls): + return cls._registered_file_reader diff --git a/lazyllm/tools/rag/group_doc.py b/lazyllm/tools/rag/group_doc.py index aac17afb..e0af0e48 100644 --- a/lazyllm/tools/rag/group_doc.py +++ b/lazyllm/tools/rag/group_doc.py @@ 
-1,6 +1,6 @@ import os import shutil -from typing import Callable, List +from typing import Callable, List, Optional, Dict from .doc_impl import DocImpl import lazyllm from .store import LAZY_ROOT_NAME @@ -10,7 +10,7 @@ class DocGroupImpl(lazyllm.ModuleBase): - def __init__(self, dataset_path, embed) -> None: + def __init__(self, dataset_path, embed, readers: Optional[Dict] = None) -> None: super().__init__() self._dataset_path = dataset_path self._embed = embed @@ -22,7 +22,7 @@ def __init__(self, dataset_path, embed) -> None: file_paths = self._list_all_files(self.dataset_path, lambda x: DATA_DIR in x) self._impl: DocImpl = DocImpl( - doc_files=file_paths, embed=self._embed, doc_name="lazyllm_doc" + doc_files=file_paths, embed=self._embed, readers=readers, doc_name="lazyllm_doc" ) @property diff --git a/lazyllm/tools/rag/readers/docxReader.py b/lazyllm/tools/rag/readers/docxReader.py index 308a29cb..ff472013 100644 --- a/lazyllm/tools/rag/readers/docxReader.py +++ b/lazyllm/tools/rag/readers/docxReader.py @@ -6,8 +6,8 @@ from ..store import DocNode class DocxReader(LazyLLMReaderBase): - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) try: import docx2txt diff --git a/lazyllm/tools/rag/readers/epubReader.py b/lazyllm/tools/rag/readers/epubReader.py index 699993a8..0e208dbf 100644 --- a/lazyllm/tools/rag/readers/epubReader.py +++ b/lazyllm/tools/rag/readers/epubReader.py @@ -7,8 +7,8 @@ from lazyllm import LOG class EpubReader(LazyLLMReaderBase): - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: try: 
import ebooklib import html2text diff --git a/lazyllm/tools/rag/readers/hwpReader.py b/lazyllm/tools/rag/readers/hwpReader.py index 916b5197..9678336e 100644 --- a/lazyllm/tools/rag/readers/hwpReader.py +++ b/lazyllm/tools/rag/readers/hwpReader.py @@ -9,8 +9,8 @@ from lazyllm import LOG class HWPReader(LazyLLMReaderBase): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._FILE_HEADER_SECTION = "FileHeader" self._HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation" self._SECTION_NAME_LENGTH = len("Section") @@ -18,8 +18,8 @@ def __init__(self, *args, **kwargs) -> None: self._HWP_TEXT_TAGS = [67] self._text = "" - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: try: import olefile except ImportError: diff --git a/lazyllm/tools/rag/readers/imageReader.py b/lazyllm/tools/rag/readers/imageReader.py index 7a085d77..ee610bbc 100644 --- a/lazyllm/tools/rag/readers/imageReader.py +++ b/lazyllm/tools/rag/readers/imageReader.py @@ -2,7 +2,7 @@ import re from io import BytesIO from pathlib import Path -from typing import Dict, List, Optional, Any, cast +from typing import Dict, List, Optional, cast from fsspec import AbstractFileSystem from PIL import Image @@ -20,7 +20,9 @@ def b64_2_img(data: str) -> Image: class ImageReader(LazyLLMReaderBase): def __init__(self, parser_config: Optional[Dict] = None, keep_image: bool = False, parse_text: bool = False, - text_type: str = "text", pytesseract_model_kwargs: Dict[str, Any] = {}) -> None: + text_type: str = "text", pytesseract_model_kwargs: Optional[Dict] = None, + return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._text_type = text_type if 
parser_config is None and parse_text: if text_type == "plain_text": @@ -49,10 +51,10 @@ def __init__(self, parser_config: Optional[Dict] = None, keep_image: bool = Fals self._parser_config = parser_config self._keep_image = keep_image self._parse_text = parse_text - self._pytesseract_model_kwargs = pytesseract_model_kwargs + self._pytesseract_model_kwargs = pytesseract_model_kwargs or {} - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) if fs: diff --git a/lazyllm/tools/rag/readers/ipynbReader.py b/lazyllm/tools/rag/readers/ipynbReader.py index d08bd992..66c0e192 100644 --- a/lazyllm/tools/rag/readers/ipynbReader.py +++ b/lazyllm/tools/rag/readers/ipynbReader.py @@ -7,12 +7,13 @@ from ..store import DocNode class IPYNBReader(LazyLLMReaderBase): - def __init__(self, parser_config: Optional[Dict] = None, concatenate: bool = False): + def __init__(self, parser_config: Optional[Dict] = None, concatenate: bool = False, return_trace: bool = True): + super().__init__(return_trace=return_trace) self._parser_config = parser_config self._concatenate = concatenate - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) if file.name.endswith(".ipynb"): diff --git a/lazyllm/tools/rag/readers/markdownReader.py b/lazyllm/tools/rag/readers/markdownReader.py index 1a03a9f5..c1748f55 100644 --- a/lazyllm/tools/rag/readers/markdownReader.py +++ b/lazyllm/tools/rag/readers/markdownReader.py @@ -8,8 +8,8 @@ from ..store import DocNode class MarkdownReader(LazyLLMReaderBase): - def 
__init__(self, *args, remove_hyperlinks: bool = True, remove_images: bool = True, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, remove_hyperlinks: bool = True, remove_images: bool = True, return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images @@ -56,8 +56,8 @@ def _parse_tups(self, filepath: Path, errors: str = "ignore", if self._remove_images: content = self.remove_images(content) return self._markdown_to_tups(content) - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) tups = self._parse_tups(file, fs=fs) diff --git a/lazyllm/tools/rag/readers/mboxreader.py b/lazyllm/tools/rag/readers/mboxreader.py index 93537406..567854c8 100644 --- a/lazyllm/tools/rag/readers/mboxreader.py +++ b/lazyllm/tools/rag/readers/mboxreader.py @@ -15,18 +15,19 @@ class MboxReader(LazyLLMReaderBase): "Content: {_content}" ) - def __init__(self, *args, max_count: int = 0, message_format: str = DEFAULT_MESSAGE_FORMAT, **kwargs) -> None: + def __init__(self, max_count: int = 0, message_format: str = DEFAULT_MESSAGE_FORMAT, + return_trace: bool = True) -> None: try: from bs4 import BeautifulSoup # noqa except ImportError: raise ImportError("`BeautifulSoup` package not found: `pip install beautifulsoup4`") - super().__init__(*args, **kwargs) + super().__init__(return_trace=return_trace) self._max_count = max_count self._message_format = message_format - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> 
List[DocNode]: import mailbox from email.parser import BytesParser from email.policy import default diff --git a/lazyllm/tools/rag/readers/pandasReader.py b/lazyllm/tools/rag/readers/pandasReader.py index 69d2c75d..bbe2cb60 100644 --- a/lazyllm/tools/rag/readers/pandasReader.py +++ b/lazyllm/tools/rag/readers/pandasReader.py @@ -8,16 +8,16 @@ from ..store import DocNode class PandasCSVReader(LazyLLMReaderBase): - def __init__(self, *args, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", - pandas_config: dict = {}, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", + pandas_config: Optional[Dict] = None, return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._concat_rows = concat_rows self._col_joiner = col_joiner self._row_joiner = row_joiner - self._pandas_config = pandas_config + self._pandas_config = pandas_config or {} - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) if fs: @@ -32,15 +32,15 @@ def load_data(self, file: Path, extra_info: Optional[Dict] = None, else: return [DocNode(text=text, metadata=extra_info or {}) for text in text_list] class PandasExcelReader(LazyLLMReaderBase): - def __init__(self, *args, concat_rows: bool = True, sheet_name: Optional[str] = None, - pandas_config: dict = {}, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, concat_rows: bool = True, sheet_name: Optional[str] = None, + pandas_config: Optional[Dict] = None, return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._concat_rows = concat_rows self._sheet_name = sheet_name - self._pandas_config = pandas_config + 
self._pandas_config = pandas_config or {} - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: openpyxl_spec = importlib.util.find_spec("openpyxl") if openpyxl_spec is not None: pass else: raise ImportError("Please install openpyxl to read Excel files. " diff --git a/lazyllm/tools/rag/readers/pdfReader.py b/lazyllm/tools/rag/readers/pdfReader.py index 315cdca8..8982c424 100644 --- a/lazyllm/tools/rag/readers/pdfReader.py +++ b/lazyllm/tools/rag/readers/pdfReader.py @@ -10,12 +10,13 @@ RETRY_TIMES = 3 class PDFReader(LazyLLMReaderBase): - def __init__(self, return_full_document: bool = False) -> None: + def __init__(self, return_full_document: bool = False, return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._return_full_document = return_full_document @retry(stop=stop_after_attempt(RETRY_TIMES)) - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: if not isinstance(file, Path): file = Path(file) try: diff --git a/lazyllm/tools/rag/readers/pptxReader.py b/lazyllm/tools/rag/readers/pptxReader.py index fb9ef304..8085844d 100644 --- a/lazyllm/tools/rag/readers/pptxReader.py +++ b/lazyllm/tools/rag/readers/pptxReader.py @@ -8,7 +8,7 @@ from ..store import DocNode class PPTXReader(LazyLLMReaderBase): - def __init__(self) -> None: + def __init__(self, return_trace: bool = True) -> None: try: import torch # noqa from PIL import Image # noqa @@ -18,6 +18,7 @@ def __init__(self) -> None: raise ImportError("Please install extra dependencies that are required for the " "PPTXReader: `pip install torch transformers python-pptx Pillow`") 
+ super().__init__(return_trace=return_trace) model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") @@ -49,8 +50,8 @@ def _caption_image(self, tmp_image_file: str) -> str: preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) return preds[0].strip() - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: from pptx import Presentation if not isinstance(file, Path): file = Path(file) diff --git a/lazyllm/tools/rag/readers/readerBase.py b/lazyllm/tools/rag/readers/readerBase.py index 14f2d0af..70515e52 100644 --- a/lazyllm/tools/rag/readers/readerBase.py +++ b/lazyllm/tools/rag/readers/readerBase.py @@ -4,13 +4,20 @@ from ....common import LazyLLMRegisterMetaClass from ..store import DocNode +from lazyllm.module import ModuleBase -class LazyLLMReaderBase(metaclass=LazyLLMRegisterMetaClass): - def lazy_load_data(self, *args, **load_kwargs) -> Iterable[DocNode]: +class LazyLLMReaderBase(ModuleBase, metaclass=LazyLLMRegisterMetaClass): + def __init__(self, *args, return_trace: bool = True, **kwargs): + super().__init__(return_trace=return_trace) + + def _lazy_load_data(self, *args, **load_kwargs) -> Iterable[DocNode]: raise NotImplementedError(f"{self.__class__.__name__} does not implement lazy_load_data method.") - def load_data(self, *args, **load_kwargs) -> List[DocNode]: - return list(self.lazy_load_data(*args, **load_kwargs)) + def _load_data(self, *args, **load_kwargs) -> List[DocNode]: + return list(self._lazy_load_data(*args, **load_kwargs)) + + def forward(self, *args, **kwargs) -> List[DocNode]: + return self._load_data(*args, 
**kwargs) def get_default_fs(): diff --git a/lazyllm/tools/rag/readers/videoAudioReader.py b/lazyllm/tools/rag/readers/videoAudioReader.py index f375d82e..02236e75 100644 --- a/lazyllm/tools/rag/readers/videoAudioReader.py +++ b/lazyllm/tools/rag/readers/videoAudioReader.py @@ -6,8 +6,8 @@ from ..store import DocNode class VideoAudioReader(LazyLLMReaderBase): - def __init__(self, *args, model_version: str = "base", **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, model_version: str = "base", return_trace: bool = True) -> None: + super().__init__(return_trace=return_trace) self._model_version = model_version try: @@ -19,8 +19,8 @@ def __init__(self, *args, model_version: str = "base", **kwargs) -> None: model = whisper.load_model(self._model_version) self._parser_config = {"model": model} - def load_data(self, file: Path, extra_info: Optional[Dict] = None, - fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: + def _load_data(self, file: Path, extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: import whisper if not isinstance(file, Path): file = Path(file) diff --git a/lazyllm/tools/rag/store.py b/lazyllm/tools/rag/store.py index 374f92d0..e304ab5c 100644 --- a/lazyllm/tools/rag/store.py +++ b/lazyllm/tools/rag/store.py @@ -28,13 +28,13 @@ def __init__( group: Optional[str] = None, embedding: Optional[List[float]] = None, parent: Optional["DocNode"] = None, - metadata: Optional[Dict[str, Any]] = {}, + metadata: Optional[Dict[str, Any]] = None, ) -> None: self.uid: str = uid if uid else str(uuid.uuid4()) self.text: Optional[str] = text self.group: Optional[str] = group self.embedding: Optional[List[float]] = embedding or None - self._metadata: Dict[str, Any] = metadata + self._metadata: Dict[str, Any] = metadata or {} # Metadata keys that are excluded from text for the embed model. self._excluded_embed_metadata_keys: List[str] = [] # Metadata keys that are excluded from text for the LLM. 
diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py new file mode 100644 index 00000000..79a7a8ba --- /dev/null +++ b/tests/basic_tests/test_rag_reader.py @@ -0,0 +1,44 @@ +import os +import lazyllm +from lazyllm import SimpleDirectoryReader, Document +from lazyllm.tools.rag.readers import ReaderBase +from lazyllm.tools.rag import DocNode + +class YmlReader(ReaderBase): + def _load_data(self, file, extra_info=None, fs=None): + with open(file, 'r') as f: + data = f.read() + return [DocNode(text=data, metadata=extra_info or {})] + +class TestRagReader(object): + @classmethod + def setup_class(cls): + cls.doc = Document(dataset_path="ci_data", create_ui=False) + cls.datasets = os.path.join(lazyllm.config['data_path'], "ci_data/default/__data/sources") + + def test_reader_file(self): + files = [os.path.join(self.datasets, "联网搜索.pdf"), os.path.join(self.datasets, "说明文档测试.docx")] + reader = SimpleDirectoryReader(input_files=files) + docs = [] + for doc in reader(): + print(doc) + docs.append(doc) + print(len(docs)) + assert len(docs) == 3 + + def test_reader_dir(self): + input_dir = self.datasets + reader = SimpleDirectoryReader(input_dir=input_dir, + exclude=["*.jpg", "*.mp3", "*.yml", "*.pdf", ".docx", "*.pptx"]) + docs = [] + for doc in reader(): + print(doc) + docs.append(doc) + print(len(docs)) + assert len(docs) == 13 + + def test_register_reader(self): + self.doc.register_instance_file_reader({lazyllm.config['data_path'] + "/**/*.yml": YmlReader()}) + files = [os.path.join(self.datasets, "reader_test.yml")] + docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) + assert len(docs) == 1 From 7b6bc80159a97eab2bc7da982009d54b14caaea9 Mon Sep 17 00:00:00 2001 From: wangjian Date: Fri, 13 Sep 2024 19:33:17 +0800 Subject: [PATCH 04/11] optimizing rag readers register --- lazyllm/__init__.py | 3 +- lazyllm/tools/__init__.py | 3 +- lazyllm/tools/rag/dataReader.py | 88 +++++++++++++--------------- 
lazyllm/tools/rag/data_loaders.py | 19 +++--- lazyllm/tools/rag/doc_impl.py | 5 +- lazyllm/tools/rag/document.py | 24 +++----- lazyllm/tools/rag/group_doc.py | 6 +- lazyllm/tools/rag/readers/readme.md | 1 + tests/basic_tests/test_rag_reader.py | 6 +- 9 files changed, 75 insertions(+), 80 deletions(-) create mode 100644 lazyllm/tools/rag/readers/readme.md diff --git a/lazyllm/__init__.py b/lazyllm/__init__.py index bce51dd2..5ba10262 100644 --- a/lazyllm/__init__.py +++ b/lazyllm/__init__.py @@ -15,7 +15,7 @@ from .client import redis_client from .tools import (Document, Reranker, Retriever, WebModule, ToolManager, FunctionCall, FunctionCallAgent, fc_register, ReactAgent, PlanAndSolveAgent, ReWOOAgent, SentenceSplitter, - LLMParser, SimpleDirectoryReader) + LLMParser) from .docs import add_doc config.done() @@ -73,7 +73,6 @@ 'PlanAndSolveAgent', 'ReWOOAgent', 'SentenceSplitter', - 'SimpleDirectoryReader', # docs 'add_doc', diff --git a/lazyllm/tools/__init__.py b/lazyllm/tools/__init__.py index 676f54f9..09789418 100644 --- a/lazyllm/tools/__init__.py +++ b/lazyllm/tools/__init__.py @@ -1,4 +1,4 @@ -from .rag import (Document, Reranker, Retriever, SentenceSplitter, LLMParser, SimpleDirectoryReader) +from .rag import Document, Reranker, Retriever, SentenceSplitter, LLMParser from .webpages import WebModule from .agent import ( ToolManager, @@ -29,5 +29,4 @@ "SentenceSplitter", "SQLiteTool", "SqlModule", - "SimpleDirectoryReader", ] diff --git a/lazyllm/tools/rag/dataReader.py b/lazyllm/tools/rag/dataReader.py index 1b05a879..cc04c030 100644 --- a/lazyllm/tools/rag/dataReader.py +++ b/lazyllm/tools/rag/dataReader.py @@ -1,3 +1,7 @@ +""" +The overall process of SimpleDirectoryReader is borrowed from LLAMA_INDEX, but we have added a customized part +based on it, that is, allowing users to register custom rules instead of processing only based on file suffixes. 
+""" import os import mimetypes import multiprocessing @@ -6,12 +10,12 @@ from datetime import datetime from functools import reduce from itertools import repeat -from typing import Dict, Optional, List, Callable +from typing import Dict, Optional, List, Callable, Type from pathlib import Path, PurePosixPath, PurePath from fsspec import AbstractFileSystem from lazyllm import ModuleBase, LOG from .store import DocNode -from .readers import (PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, +from .readers import (ReaderBase, PDFReader, DocxReader, HWPReader, PPTXReader, ImageReader, IPYNBReader, EpubReader, MarkdownReader, MboxReader, PandasCSVReader, PandasExcelReader, VideoAudioReader, get_default_fs, is_default_fs) @@ -51,6 +55,29 @@ def __call__(self, file_path: str) -> Dict: return {meta_key: meta_value for meta_key, meta_value in default_meta.items() if meta_value is not None} class SimpleDirectoryReader(ModuleBase): + default_file_readers: Dict[str, Type[ReaderBase]] = { + "*.pdf": PDFReader, + "*.docx": DocxReader, + "*.hwp": HWPReader, + "*.pptx": PPTXReader, + "*.ppt": PPTXReader, + "*.pptm": PPTXReader, + "*.gif": ImageReader, + "*.jpeg": ImageReader, + "*.jpg": ImageReader, + "*.png": ImageReader, + "*.webp": ImageReader, + "*.ipynb": IPYNBReader, + "*.epub": EpubReader, + "*.md": MarkdownReader, + "*.mbox": MboxReader, + "*.csv": PandasCSVReader, + "*.xls": PandasExcelReader, + "*.xlsx": PandasExcelReader, + "*.mp3": VideoAudioReader, + "*.mp4": VideoAudioReader, + } + def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] = None, exclude: Optional[List] = None, exclude_hidden: bool = True, recursive: bool = False, encoding: str = "utf-8", filename_as_id: bool = False, required_exts: Optional[List[str]] = None, @@ -59,7 +86,7 @@ def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] return_trace: bool = False) -> None: super().__init__(return_trace=return_trace) - if not input_dir 
and not input_files: + if (not input_dir and not input_files) or (input_dir and input_files): raise ValueError("Must provide either `input_dir` or `input_files`.") self._fs = fs or get_default_fs() @@ -85,10 +112,7 @@ def __init__(self, input_dir: Optional[str] = None, input_files: Optional[List] self._input_dir = self._Path(input_dir) self._input_files = self._add_files(self._input_dir) - if file_extractor is not None: - self._file_extractor = file_extractor - else: - self._file_extractor = {} + self._file_extractor = file_extractor or {} self._file_metadata = file_metadata or _DefaultFileMetadataFunc(self._fs) self._filename_as_id = filename_as_id @@ -166,25 +190,21 @@ def _exclude_metadata(self, documents: List[DocNode]) -> List[DocNode]: @staticmethod def load_file(input_file: Path, file_metadata: Callable[[str], Dict], file_extractor: Dict[str, Callable], - filename_as_id: bool = False, encoding: str = "utf-8", _Path: PurePath = Path, + filename_as_id: bool = False, encoding: str = "utf-8", pathm: PurePath = Path, fs: Optional[AbstractFileSystem] = None) -> List[DocNode]: - default_file_reader = SimpleDirectoryReader._loading_default_file_reader() - default_file_reader_patterns = list(default_file_reader.keys()) metadata: Optional[dict] = None documents: List[DocNode] = [] if file_metadata is not None: metadata = file_metadata(str(input_file)) file_reader_patterns = list(file_extractor.keys()) - file_reader_patterns.extend(default_file_reader_patterns) for pattern in file_reader_patterns: - pt = str(_Path(pattern)) - match_pattern = pt if pt.startswith("*") else os.path.join(str(_Path.cwd()), pt) + pt = str(pathm(pattern)) + match_pattern = pt if pt.startswith("*") else os.path.join(str(pathm.cwd()), pt) if fnmatch.fnmatch(input_file, match_pattern): - reader = file_extractor[pattern] if pattern not in default_file_reader_patterns \ - else default_file_reader[pattern] - + reader = file_extractor[pattern] + reader = reader() if isinstance(reader, type) else 
reader kwargs = {"extra_info": metadata} if fs and not is_default_fs(fs): kwargs['fs'] = fs docs = reader(input_file, **kwargs) @@ -212,13 +232,14 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N fs = fs or self._fs process_file = self._input_files - if num_workers and num_workers > 1: + if num_workers and num_workers >= 1: if num_workers > multiprocessing.cpu_count(): LOG.warning("Specified num_workers exceed number of CPUs in the system. " "Setting `num_workers` down to the maximum CPU count.") with multiprocessing.get_context("spawn").Pool(num_workers) as p: results = p.starmap(SimpleDirectoryReader.load_file, - zip(process_file, repeat(self._file_metadata), repeat(self._file_extractor), + zip(process_file, repeat(self._file_metadata), + repeat({**self._file_extractor, **self.default_file_readers}), repeat(self._filename_as_id), repeat(self._encoding), repeat(self._Path), repeat(self._fs))) documents = reduce(lambda x, y: x + y, results) @@ -228,36 +249,11 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N for input_file in process_file: documents.extend( SimpleDirectoryReader.load_file(input_file=input_file, file_metadata=self._file_metadata, - file_extractor=self._file_extractor, encoding=self._encoding, - filename_as_id=self._filename_as_id, _Path=self._Path, fs=self._fs)) + file_extractor={**self._file_extractor, **self.default_file_readers}, + filename_as_id=self._filename_as_id, encoding=self._encoding, + pathm=self._Path, fs=self._fs)) return self._exclude_metadata(documents) def forward(self, *args, **kwargs) -> List[DocNode]: return self._load_data(*args, **kwargs) - - @staticmethod - def _loading_default_file_reader() -> Dict[str, Callable]: - default_file_reader: Dict[str, Callable] = { - "*.pdf": PDFReader(), - "*.docx": DocxReader(), - "*.hwp": HWPReader(), - "*.pptx": PPTXReader(), - "*.ppt": PPTXReader(), - "*.pptm": PPTXReader(), - "*.gif": ImageReader(), - "*.jpeg": 
ImageReader(), - "*.jpg": ImageReader(), - "*.png": ImageReader(), - "*.webp": ImageReader(), - "*.ipynb": IPYNBReader(), - "*.epub": EpubReader(), - "*.md": MarkdownReader(), - "*.mbox": MboxReader(), - "*.csv": PandasCSVReader(), - "*.xls": PandasExcelReader(), - "*.xlsx": PandasExcelReader(), - "*.mp3": VideoAudioReader(), - "*.mp4": VideoAudioReader(), - } - return default_file_reader diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index fce712df..4710eb14 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -1,24 +1,27 @@ -from typing import List, Optional, Dict, Callable +from typing import List, Optional, Dict from .store import DocNode, LAZY_ROOT_NAME from lazyllm import LOG from .dataReader import SimpleDirectoryReader class DirectoryReader: - def __init__(self, input_files: List[str], readers: Optional[Dict] = None) -> None: + def __init__(self, input_files: List[str], local_readers: Optional[Dict] = None, + global_readers: Optional[Dict] = None) -> None: self._input_files = input_files - self._readers = readers + self._local_readers = local_readers + self._global_readers = global_readers def load_data(self, input_files: Optional[List[str]] = None) -> List[DocNode]: input_files = input_files or self._input_files LOG.info(f"DirectoryReader loads data, input files: {input_files}") - reader = SimpleDirectoryReader(input_files=input_files, file_extractor=self._readers) - nodes = [doc for doc in reader() if setattr(doc, 'group', LAZY_ROOT_NAME) is None] + reader = SimpleDirectoryReader(input_files=input_files, + file_extractor={**self._local_readers, **self._global_readers}) + nodes: List[DocNode] = [] + for doc in reader(): + doc.group = LAZY_ROOT_NAME + nodes.append(doc) if not nodes: LOG.warning( f"No nodes load from path {self.input_files}, please check your data path." 
) return nodes - - def register_file_reader(self, readers: Dict[str, Callable]): - self._readers = readers diff --git a/lazyllm/tools/rag/doc_impl.py b/lazyllm/tools/rag/doc_impl.py index bc0c5dfc..9088932a 100644 --- a/lazyllm/tools/rag/doc_impl.py +++ b/lazyllm/tools/rag/doc_impl.py @@ -25,9 +25,10 @@ def wrapper(*args, **kwargs) -> List[float]: class DocImpl: - def __init__(self, embed, doc_files=Optional[List[str]], readers: Optional[Dict] = None, **kwargs): + def __init__(self, embed, doc_files=Optional[List[str]], local_readers: Optional[Dict] = None, + global_readers: Optional[Dict] = None, **kwargs): super().__init__() - self.directory_reader = DirectoryReader(doc_files, readers=readers) + self.directory_reader = DirectoryReader(doc_files, local_readers=local_readers, global_readers=global_readers) self.node_groups: Dict[str, Dict] = {LAZY_ROOT_NAME: {}} self._create_node_group_default() self.embed = embed_wrapper(embed) diff --git a/lazyllm/tools/rag/document.py b/lazyllm/tools/rag/document.py index 23c930af..4c27ea85 100644 --- a/lazyllm/tools/rag/document.py +++ b/lazyllm/tools/rag/document.py @@ -23,9 +23,10 @@ def __init__(self, dataset_path: str, embed: Optional[TrainableModule] = None, dataset_path = defatult_path self._create_ui = create_ui launcher = launcher if launcher else lazyllm.launchers.remote(sync=False) - self._instance_registered_file_reader: Dict[str, Callable] = {} + self._local_file_reader: Dict[str, Callable] = {} - self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed, readers=self._registered_file_reader) + self._impl = DocGroupImpl(dataset_path=dataset_path, embed=embed, local_readers=self._local_file_reader, + global_readers=self._registered_file_reader) if create_ui: doc_manager = DocManager(self._impl) self.doc_server = ServerModule(doc_manager, launcher=launcher) @@ -52,23 +53,16 @@ def create_node_group( ) -> None: self._impl.create_node_group(name, transform, parent, **kwargs) - def 
register_instance_file_reader(self, readers: Dict[str, Callable]): - self._instance_registered_file_reader.update(readers) - self._impl._impl.directory_reader.register_file_reader({**self._instance_registered_file_reader, - **self._registered_file_reader}) + def add_reader(self, pattern: str, func: Callable): + self._local_file_reader[pattern] = func @classmethod - def register_cls_file_reader(cls, match_key: str): - if isinstance(match_key, type): - raise TypeError("Document.register_file_reader() missing 1 required positional argument: 'match_key'") + def register_global_reader(cls, pattern: str, func: Optional[Callable] = None): + if func is not None: + cls._registered_file_reader[pattern] = func def decorator(klass): - if isinstance(klass, type): cls._registered_file_reader[match_key] = klass() - elif callable(klass): cls._registered_file_reader[match_key] = klass + if callable(klass): cls._registered_file_reader[pattern] = klass else: raise TypeError(f"The registered object {klass} is not a callable object.") return klass return decorator - - @classmethod - def get_registry(cls): - return cls._registered_file_reader diff --git a/lazyllm/tools/rag/group_doc.py b/lazyllm/tools/rag/group_doc.py index e0af0e48..b19efbce 100644 --- a/lazyllm/tools/rag/group_doc.py +++ b/lazyllm/tools/rag/group_doc.py @@ -10,7 +10,8 @@ class DocGroupImpl(lazyllm.ModuleBase): - def __init__(self, dataset_path, embed, readers: Optional[Dict] = None) -> None: + def __init__(self, dataset_path, embed, local_readers: Optional[Dict] = None, + global_readers: Optional[Dict] = None) -> None: super().__init__() self._dataset_path = dataset_path self._embed = embed @@ -22,7 +23,8 @@ def __init__(self, dataset_path, embed, readers: Optional[Dict] = None) -> None: file_paths = self._list_all_files(self.dataset_path, lambda x: DATA_DIR in x) self._impl: DocImpl = DocImpl( - doc_files=file_paths, embed=self._embed, readers=readers, doc_name="lazyllm_doc" + doc_files=file_paths, embed=self._embed, 
local_readers=local_readers, global_readers=global_readers, + doc_name="lazyllm_doc" ) @property diff --git a/lazyllm/tools/rag/readers/readme.md b/lazyllm/tools/rag/readers/readme.md new file mode 100644 index 00000000..c3e8e627 --- /dev/null +++ b/lazyllm/tools/rag/readers/readme.md @@ -0,0 +1 @@ +Each reader module is borrowd from LLAMA_INDEX, but we have added customized parts, including the entire reader, which is inherited from the Modulebase base class, making all reader modules callable. diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index 79a7a8ba..91ad98b0 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -1,8 +1,8 @@ import os import lazyllm -from lazyllm import SimpleDirectoryReader, Document +from lazyllm import Document from lazyllm.tools.rag.readers import ReaderBase -from lazyllm.tools.rag import DocNode +from lazyllm.tools.rag import SimpleDirectoryReader, DocNode class YmlReader(ReaderBase): def _load_data(self, file, extra_info=None, fs=None): @@ -38,7 +38,7 @@ def test_reader_dir(self): assert len(docs) == 13 def test_register_reader(self): - self.doc.register_instance_file_reader({lazyllm.config['data_path'] + "/**/*.yml": YmlReader()}) + self.doc.add_reader("/**/*.yml", YmlReader) files = [os.path.join(self.datasets, "reader_test.yml")] docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) assert len(docs) == 1 From 973c0234142f049962383dec9aaab551478e27db Mon Sep 17 00:00:00 2001 From: wangjian Date: Fri, 13 Sep 2024 19:47:14 +0800 Subject: [PATCH 05/11] add two package for readers in unit tests --- .github/workflows/main.yml | 1 + tests/requirements.txt | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cc84e4e8..2abe2275 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -61,6 +61,7 @@ jobs: run: | set -ex cd ${{ env.CI_PATH }} + 
pip install -r tests/requirements.txt realpath . env | grep '^SCC' export LAZYLLM_SCO_ENV_NAME=lazyllm diff --git a/tests/requirements.txt b/tests/requirements.txt index a9bc6c2e..953766fd 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1 +1,3 @@ -wikipedia \ No newline at end of file +wikipedia +docx2txt +olefile From 5eca597b85fe741a0e929795a169d40e6fb1a4dc Mon Sep 17 00:00:00 2001 From: wangjian Date: Fri, 13 Sep 2024 22:23:10 +0800 Subject: [PATCH 06/11] modify rag reader file path in unit test --- tests/basic_tests/test_rag_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index 91ad98b0..0910429a 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -13,8 +13,8 @@ def _load_data(self, file, extra_info=None, fs=None): class TestRagReader(object): @classmethod def setup_class(cls): - cls.doc = Document(dataset_path="ci_data", create_ui=False) - cls.datasets = os.path.join(lazyllm.config['data_path'], "ci_data/default/__data/sources") + cls.doc = Document(dataset_path="ci_data/rag_reader", create_ui=False) + cls.datasets = os.path.join(lazyllm.config['data_path'], "ci_data/rag_reader/default/__data/sources") def test_reader_file(self): files = [os.path.join(self.datasets, "联网搜索.pdf"), os.path.join(self.datasets, "说明文档测试.docx")] From c03756be749697dea112cabceec75c77531b3683 Mon Sep 17 00:00:00 2001 From: wangjian Date: Sat, 14 Sep 2024 15:19:08 +0800 Subject: [PATCH 07/11] Optimize the usage priority of local and global registered file readers --- lazyllm/tools/rag/dataReader.py | 10 ++++----- lazyllm/tools/rag/data_loaders.py | 6 +++--- tests/basic_tests/test_rag_reader.py | 31 +++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/lazyllm/tools/rag/dataReader.py b/lazyllm/tools/rag/dataReader.py index cc04c030..ac2408a0 100644 --- 
a/lazyllm/tools/rag/dataReader.py +++ b/lazyllm/tools/rag/dataReader.py @@ -231,6 +231,8 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N fs = fs or self._fs process_file = self._input_files + file_readers = {k: self._file_extractor.get(k, self.default_file_readers.get(k)) + for k in self._file_extractor.keys() | self.default_file_readers.keys()} if num_workers and num_workers >= 1: if num_workers > multiprocessing.cpu_count(): @@ -238,8 +240,7 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N "Setting `num_workers` down to the maximum CPU count.") with multiprocessing.get_context("spawn").Pool(num_workers) as p: results = p.starmap(SimpleDirectoryReader.load_file, - zip(process_file, repeat(self._file_metadata), - repeat({**self._file_extractor, **self.default_file_readers}), + zip(process_file, repeat(self._file_metadata), repeat(file_readers), repeat(self._filename_as_id), repeat(self._encoding), repeat(self._Path), repeat(self._fs))) documents = reduce(lambda x, y: x + y, results) @@ -249,9 +250,8 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N for input_file in process_file: documents.extend( SimpleDirectoryReader.load_file(input_file=input_file, file_metadata=self._file_metadata, - file_extractor={**self._file_extractor, **self.default_file_readers}, - filename_as_id=self._filename_as_id, encoding=self._encoding, - pathm=self._Path, fs=self._fs)) + file_extractor=file_readers, filename_as_id=self._filename_as_id, + encoding=self._encoding, pathm=self._Path, fs=self._fs)) return self._exclude_metadata(documents) diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index 4710eb14..4d702484 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -12,10 +12,10 @@ def __init__(self, input_files: List[str], local_readers: Optional[Dict] = None, def load_data(self, input_files: Optional[List[str]] = 
None) -> List[DocNode]: input_files = input_files or self._input_files - + file_readers = {k: self._local_readers.get(k, self._global_readers.get(k)) + for k in self._local_readers.keys() | self._global_readers.keys()} LOG.info(f"DirectoryReader loads data, input files: {input_files}") - reader = SimpleDirectoryReader(input_files=input_files, - file_extractor={**self._local_readers, **self._global_readers}) + reader = SimpleDirectoryReader(input_files=input_files, file_extractor=file_readers) nodes: List[DocNode] = [] for doc in reader(): doc.group = LAZY_ROOT_NAME diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index 0910429a..d8c0b1e5 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -6,10 +6,20 @@ class YmlReader(ReaderBase): def _load_data(self, file, extra_info=None, fs=None): + try: + import yaml + except ImportError: + raise ImportError("yaml is required to read YAML file: `pip install pyyaml`") with open(file, 'r') as f: - data = f.read() + data = yaml.safe_load(f) + return [DocNode(text=data, metadata=extra_info or {})] +def processYml(file, extra_info=None): + with open(file, 'r') as f: + data = f.read() + return [DocNode(text=data, metadata=extra_info or {})] + class TestRagReader(object): @classmethod def setup_class(cls): @@ -37,8 +47,23 @@ def test_reader_dir(self): print(len(docs)) assert len(docs) == 13 - def test_register_reader(self): - self.doc.add_reader("/**/*.yml", YmlReader) + def test_register_local_reader(self): + self.doc.add_reader("/**/*.yml", processYml) + files = [os.path.join(self.datasets, "reader_test.yml")] + docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) + assert len(docs) == 1 + + def test_register_global_reader(self): + Document.register_global_reader("/**/*.yml", processYml) files = [os.path.join(self.datasets, "reader_test.yml")] docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) assert 
len(docs) == 1 + + def test_register_local_and_global_reader(self): + Document.register_global_reader("/**/*.yml", processYml) + self.doc.add_reader("/**/*.yml", YmlReader) + files = [os.path.join(self.datasets, "reader_test.yml")] + try: + self.doc._impl._impl.directory_reader.load_data(input_files=files) + except ImportError as e: + assert e == "yaml is required to read YAML file: `pip install pyyaml`" From e5d70eb68b8e67c4a37fef6d862b255303cd9f23 Mon Sep 17 00:00:00 2001 From: wangjian Date: Sat, 14 Sep 2024 20:16:26 +0800 Subject: [PATCH 08/11] modify reader usage priority --- lazyllm/tools/rag/dataReader.py | 5 +++-- lazyllm/tools/rag/data_loaders.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lazyllm/tools/rag/dataReader.py b/lazyllm/tools/rag/dataReader.py index ac2408a0..c2cc20f3 100644 --- a/lazyllm/tools/rag/dataReader.py +++ b/lazyllm/tools/rag/dataReader.py @@ -231,8 +231,9 @@ def _load_data(self, show_progress: bool = False, num_workers: Optional[int] = N fs = fs or self._fs process_file = self._input_files - file_readers = {k: self._file_extractor.get(k, self.default_file_readers.get(k)) - for k in self._file_extractor.keys() | self.default_file_readers.keys()} + file_readers = self._file_extractor.copy() + for key, func in self.default_file_readers.items(): + if key not in file_readers: file_readers[key] = func if num_workers and num_workers >= 1: if num_workers > multiprocessing.cpu_count(): diff --git a/lazyllm/tools/rag/data_loaders.py b/lazyllm/tools/rag/data_loaders.py index 4d702484..61d0392a 100644 --- a/lazyllm/tools/rag/data_loaders.py +++ b/lazyllm/tools/rag/data_loaders.py @@ -12,8 +12,9 @@ def __init__(self, input_files: List[str], local_readers: Optional[Dict] = None, def load_data(self, input_files: Optional[List[str]] = None) -> List[DocNode]: input_files = input_files or self._input_files - file_readers = {k: self._local_readers.get(k, self._global_readers.get(k)) - for k in self._local_readers.keys() | 
self._global_readers.keys()} + file_readers = self._local_readers.copy() + for key, func in self._global_readers.items(): + if key not in file_readers: file_readers[key] = func LOG.info(f"DirectoryReader loads data, input files: {input_files}") reader = SimpleDirectoryReader(input_files=input_files, file_extractor=file_readers) nodes: List[DocNode] = [] From 0d70ed86d593b44de46c4af3894c5636f92271b5 Mon Sep 17 00:00:00 2001 From: wangjian Date: Sat, 14 Sep 2024 20:23:18 +0800 Subject: [PATCH 09/11] modify pattern in readers of unittests --- tests/basic_tests/test_rag_reader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index d8c0b1e5..bc044299 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -48,20 +48,20 @@ def test_reader_dir(self): assert len(docs) == 13 def test_register_local_reader(self): - self.doc.add_reader("/**/*.yml", processYml) + self.doc.add_reader("**/*.yml", processYml) files = [os.path.join(self.datasets, "reader_test.yml")] docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) assert len(docs) == 1 def test_register_global_reader(self): - Document.register_global_reader("/**/*.yml", processYml) + Document.register_global_reader("**/*.yml", processYml) files = [os.path.join(self.datasets, "reader_test.yml")] docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) assert len(docs) == 1 def test_register_local_and_global_reader(self): - Document.register_global_reader("/**/*.yml", processYml) - self.doc.add_reader("/**/*.yml", YmlReader) + Document.register_global_reader("**/*.yml", processYml) + self.doc.add_reader("**/*.yml", YmlReader) files = [os.path.join(self.datasets, "reader_test.yml")] try: self.doc._impl._impl.directory_reader.load_data(input_files=files) From 8b56a640a6abedea6d027aa594256403a9c7ca0c Mon Sep 17 00:00:00 2001 From: wangjian Date: Sat, 14 
Sep 2024 23:09:22 +0800 Subject: [PATCH 10/11] modify reader test bug in unit tests --- tests/basic_tests/test_rag_reader.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index bc044299..30f44343 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -12,8 +12,9 @@ def _load_data(self, file, extra_info=None, fs=None): raise ImportError("yaml is required to read YAML file: `pip install pyyaml`") with open(file, 'r') as f: data = yaml.safe_load(f) - - return [DocNode(text=data, metadata=extra_info or {})] + node = DocNode(text=data, metadata=extra_info or {}) + node.text = "Call the class YmlReader." + return [node] def processYml(file, extra_info=None): with open(file, 'r') as f: @@ -63,7 +64,5 @@ def test_register_local_and_global_reader(self): Document.register_global_reader("**/*.yml", processYml) self.doc.add_reader("**/*.yml", YmlReader) files = [os.path.join(self.datasets, "reader_test.yml")] - try: - self.doc._impl._impl.directory_reader.load_data(input_files=files) - except ImportError as e: - assert e == "yaml is required to read YAML file: `pip install pyyaml`" + docs = self.doc._impl._impl.directory_reader.load_data(input_files=files) + assert docs[0].text == "Call the class YmlReader." 
From 36c967aee18e36367624bc9cf0c69042de153e79 Mon Sep 17 00:00:00 2001 From: wangjian Date: Sat, 14 Sep 2024 23:14:02 +0800 Subject: [PATCH 11/11] remove yaml package in rag reader of unittests --- tests/basic_tests/test_rag_reader.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/basic_tests/test_rag_reader.py b/tests/basic_tests/test_rag_reader.py index 30f44343..0d479fcc 100644 --- a/tests/basic_tests/test_rag_reader.py +++ b/tests/basic_tests/test_rag_reader.py @@ -6,12 +6,8 @@ class YmlReader(ReaderBase): def _load_data(self, file, extra_info=None, fs=None): - try: - import yaml - except ImportError: - raise ImportError("yaml is required to read YAML file: `pip install pyyaml`") with open(file, 'r') as f: - data = yaml.safe_load(f) + data = f.read() node = DocNode(text=data, metadata=extra_info or {}) node.text = "Call the class YmlReader." return [node]