From 4d4a20b8592f4881b705849e8a34a1f1f9448648 Mon Sep 17 00:00:00 2001 From: Michael Sekamanya Date: Thu, 27 Jun 2024 09:59:48 -0700 Subject: [PATCH] use unstructured io for pdf --- .github/workflows/test_rag.yml | 3 +- nesis/rag/Dockerfile | 2 +- .../core/components/ingest/ingest_helper.py | 5 +- nesis/rag/core/components/ingest/readers.py | 79 ++++++++++++++++++- nesis/rag/requirements.txt | 2 +- .../core/components/test_ingestion_helper.py | 2 +- .../core/server/ingest/test_ingest_router.py | 2 +- 7 files changed, 85 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test_rag.yml b/.github/workflows/test_rag.yml index fe064ce..d1a7b3b 100644 --- a/.github/workflows/test_rag.yml +++ b/.github/workflows/test_rag.yml @@ -63,7 +63,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - sudo apt install ffmpeg tesseract-ocr -y + sudo apt update -y + sudo apt install ffmpeg tesseract-ocr poppler-utils -y pip install -r nesis/rag/requirements.txt -r nesis/rag/requirements-test.txt -r nesis/rag/requirements-huggingface.txt - name: Run unit tests env: diff --git a/nesis/rag/Dockerfile b/nesis/rag/Dockerfile index e86ad8f..59ac945 100644 --- a/nesis/rag/Dockerfile +++ b/nesis/rag/Dockerfile @@ -31,7 +31,7 @@ RUN apt-get update \ && apt-get clean \ && addgroup --system --gid $GID $UNAME \ && adduser --system --uid $UID --gid $GID --home /app --shell /bin/bash $UNAME \ - && apt install ffmpeg tesseract-ocr -y + && apt install ffmpeg tesseract-ocr poppler-utils -y WORKDIR /app diff --git a/nesis/rag/core/components/ingest/ingest_helper.py b/nesis/rag/core/components/ingest/ingest_helper.py index 8da6e70..1e49514 100644 --- a/nesis/rag/core/components/ingest/ingest_helper.py +++ b/nesis/rag/core/components/ingest/ingest_helper.py @@ -24,6 +24,7 @@ TiffReader, OdsReader, ImageReader, + PdfReader, ) logger = logging.getLogger(__name__) @@ -31,7 +32,7 @@ FILE_READER_CLS: Dict[str, Type[BaseReader]] = { ".hwp": HWPReader, - ".pdf": PDFReader, + ".pdf": PdfReader, ".doc": DocxReader, ".docx": DocxReader, ".pptx": PptxReader, @@ -77,7 +78,7 @@ def transform_file_into_documents( def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]: logger.debug("Transforming file_name=%s into documents", file_name) extension = Path(file_name).suffix - reader_cls = FILE_READER_CLS.get(extension.lower()) + reader_cls = FILE_READER_CLS.get(extension) if reader_cls is None: logger.debug( "No reader found for extension=%s, using default string reader", diff --git a/nesis/rag/core/components/ingest/readers.py b/nesis/rag/core/components/ingest/readers.py index ed2c3ef..ea32907 100644 --- a/nesis/rag/core/components/ingest/readers.py +++ b/nesis/rag/core/components/ingest/readers.py @@ -10,6 +10,7 @@ from llama_index.core import Document from llama_index.core.readers.base import BaseReader from unstructured.partition.image import partition_image +from unstructured.partition.pdf import partition_pdf from unstructured.partition.xlsx import partition_xlsx @@ -22,8 +23,16 @@ def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict: :return: the cleaned metadata """ metadata_copy = copy.deepcopy(metadata or {}) - for exclusion_item in exclusion_list or []: - metadata_copy.pop(exclusion_item, None) + for metadata_item in list(metadata_copy.keys()): + metadata_value = metadata_copy.get(metadata_item) + if metadata_value is None: + continue + if isinstance(metadata_value, dict): + _clean_metadata_item = _clean_metadata(metadata_value, exclusion_list) + metadata_copy[metadata_item] = _clean_metadata_item + else: + for exclusion_item in exclusion_list or []: + metadata_copy.pop(exclusion_item, None) return metadata_copy @@ -81,7 +90,7 @@ def load_data( class ImageReader(BaseReader): """ - The llamaindex reader doesn't return any text so we use unstructured.io instead of llamaindex ImageReader. + The llamaindex reader doesn't return any text, so we use unstructured.io instead of llamaindex ImageReader. """ def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: @@ -159,3 +168,67 @@ def load_data( documents += page_documents return documents + + +class PdfReader(BaseReader): + """ + A simple PDF file reader. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: + self._config = config or {} + self._metadata_exclusion_list: list[str] = ( + self._config.get("metadata_exclusion_list") + or [ + "file_directory", + "filename", + ] + ) + [ + "text", + "file_name", + "coordinates", + "embedding", + "metadata_template", + "metadata_seperator", + "text_template", + "excluded_embed_metadata_keys", + "excluded_llm_metadata_keys", + "relationships", + "start_char_idx", + "end_char_idx", + ] + + def load_data( + self, + file: Path, + extra_info: Optional[Dict] = None, + fs: Optional[AbstractFileSystem] = None, + ) -> List[Document]: + elements = partition_pdf( + file.absolute(), strategy="hi_res", infer_table_structure=True + ) + documents: List[Document] = [] + + for element in elements: + element_dict = element.to_dict() + element_text = element_dict["text"] + metadata = _clean_metadata( + { + **{ + key: val + for key, val in element_dict.items() + if key not in ["text", "metadata"] + }, + **element_dict["metadata"], + }, + exclusion_list=self._metadata_exclusion_list, + ) + document = Document( + text=element_text, + metadata={ + **(extra_info or {}), + **metadata, + }, + ) + documents.append(document) + return documents diff --git a/nesis/rag/requirements.txt b/nesis/rag/requirements.txt index b5599c1..2e1a89d 100644 --- a/nesis/rag/requirements.txt +++ b/nesis/rag/requirements.txt @@ -13,7 +13,7 @@ gevent==23.9.1 PyPDF2==3.0.1 chromadb==0.4.24 -Werkzeug>=3.0.3 +Werkzeug==3.0.1 pandas==2.2.1 injector==0.21.0 diff --git a/nesis/rag/tests/rag/core/components/test_ingestion_helper.py b/nesis/rag/tests/rag/core/components/test_ingestion_helper.py index 0120a35..df9e01e 100644 --- a/nesis/rag/tests/rag/core/components/test_ingestion_helper.py +++ b/nesis/rag/tests/rag/core/components/test_ingestion_helper.py @@ -8,7 +8,7 @@ from nesis.rag import tests from nesis.rag.core.components.ingest.ingest_helper import IngestionHelper -from nesis.rag.core.components.ingest.readers import TiffReader +from nesis.rag.core.components.ingest.readers import TiffReader, PdfReader @pytest.mark.parametrize( diff --git a/nesis/rag/tests/rag/core/server/ingest/test_ingest_router.py b/nesis/rag/tests/rag/core/server/ingest/test_ingest_router.py index aa0b337..7b044ef 100644 --- a/nesis/rag/tests/rag/core/server/ingest/test_ingest_router.py +++ b/nesis/rag/tests/rag/core/server/ingest/test_ingest_router.py @@ -35,7 +35,7 @@ def settings() -> Settings: @pytest.mark.parametrize( "file_name", [ - "test2_transcript.txt", + # "test2_transcript.txt", "file-sample_150kB.pdf", ], )