Skip to content

Commit

Permalink
use unstructured io for pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
mawandm committed Jun 27, 2024
1 parent b8c3251 commit 4d4a20b
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 10 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test_rag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
sudo apt install ffmpeg tesseract-ocr -y
sudo apt update -y
sudo apt install ffmpeg tesseract-ocr poppler-utils -y
pip install -r nesis/rag/requirements.txt -r nesis/rag/requirements-test.txt -r nesis/rag/requirements-huggingface.txt
- name: Run unit tests
env:
Expand Down
2 changes: 1 addition & 1 deletion nesis/rag/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ RUN apt-get update \
&& apt-get clean \
&& addgroup --system --gid $GID $UNAME \
&& adduser --system --uid $UID --gid $GID --home /app --shell /bin/bash $UNAME \
&& apt install ffmpeg tesseract-ocr -y
&& apt install ffmpeg tesseract-ocr poppler-utils -y

WORKDIR /app

Expand Down
5 changes: 3 additions & 2 deletions nesis/rag/core/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
TiffReader,
OdsReader,
ImageReader,
PdfReader,
)

logger = logging.getLogger(__name__)


FILE_READER_CLS: Dict[str, Type[BaseReader]] = {
".hwp": HWPReader,
".pdf": PDFReader,
".pdf": PdfReader,
".doc": DocxReader,
".docx": DocxReader,
".pptx": PptxReader,
Expand Down Expand Up @@ -77,7 +78,7 @@ def transform_file_into_documents(
def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
logger.debug("Transforming file_name=%s into documents", file_name)
extension = Path(file_name).suffix
reader_cls = FILE_READER_CLS.get(extension.lower())
reader_cls = FILE_READER_CLS.get(extension)
if reader_cls is None:
logger.debug(
"No reader found for extension=%s, using default string reader",
Expand Down
79 changes: 76 additions & 3 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from unstructured.partition.image import partition_image
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.xlsx import partition_xlsx


Expand All @@ -22,8 +23,16 @@ def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict:
:return: the cleaned metadata
"""
metadata_copy = copy.deepcopy(metadata or {})
for exclusion_item in exclusion_list or []:
metadata_copy.pop(exclusion_item, None)
for metadata_item in list(metadata_copy.keys()):
metadata_value = metadata_copy.get(metadata_item)
if metadata_value is None:
continue
if isinstance(metadata_value, dict):
_clean_metadata_item = _clean_metadata(metadata_value, exclusion_list)
metadata_copy[metadata_item] = _clean_metadata_item
else:
for exclusion_item in exclusion_list or []:
metadata_copy.pop(exclusion_item, None)
return metadata_copy


Expand Down Expand Up @@ -81,7 +90,7 @@ def load_data(

class ImageReader(BaseReader):
"""
The llamaindex reader doesn't return any text so we use unstructured.io instead of llamaindex ImageReader.
The llamaindex reader doesn't return any text, so we use unstructured.io instead of llamaindex ImageReader.
"""

def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
Expand Down Expand Up @@ -159,3 +168,67 @@ def load_data(
documents += page_documents

return documents


class PdfReader(BaseReader):
    """
    PDF reader backed by unstructured.io's ``partition_pdf``.

    Each element returned by the partitioner is turned into its own
    ``Document``; the element text becomes the document text and the remaining
    element fields, merged with the element metadata, become the document
    metadata after the exclusion list has been applied.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        :param config: optional reader settings. The only key read here is
            ``metadata_exclusion_list``; when it is missing or empty the
            defaults ``file_directory`` and ``filename`` are used instead.
        """
        settings = config or {}
        self._config = settings

        # Caller-supplied exclusions, or the defaults when none are supplied.
        configurable_exclusions = settings.get("metadata_exclusion_list") or [
            "file_directory",
            "filename",
        ]
        # Keys that are always excluded, whatever the configuration says.
        # NOTE(review): "metadata_seperator" is spelled exactly as in the
        # original; presumably it mirrors an upstream field name - verify
        # before "correcting" it.
        fixed_exclusions = [
            "text",
            "file_name",
            "coordinates",
            "embedding",
            "metadata_template",
            "metadata_seperator",
            "text_template",
            "excluded_embed_metadata_keys",
            "excluded_llm_metadata_keys",
            "relationships",
            "start_char_idx",
            "end_char_idx",
        ]
        self._metadata_exclusion_list: list[str] = (
            configurable_exclusions + fixed_exclusions
        )

    def _element_to_document(self, element, extra_info: Optional[Dict]) -> Document:
        """Convert one partitioned element into a ``Document``."""
        fields = element.to_dict()
        body = fields["text"]

        # Top-level element fields first, then the nested metadata on top so
        # that nested metadata wins on key collisions.
        combined = {
            name: value
            for name, value in fields.items()
            if name not in ["text", "metadata"]
        }
        combined.update(fields["metadata"])

        cleaned = _clean_metadata(
            combined, exclusion_list=self._metadata_exclusion_list
        )
        # Element metadata takes precedence over the caller's extra_info.
        return Document(text=body, metadata={**(extra_info or {}), **cleaned})

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Partition the PDF at ``file`` and return one ``Document`` per element.

        :param file: path of the PDF; its absolute form is handed to
            ``partition_pdf``.
        :param extra_info: optional metadata merged into every document;
            element metadata overrides it on key collisions.
        :param fs: accepted for interface compatibility; not used here.
        :return: the documents, in the order the partitioner returned the
            elements.
        """
        partitioned = partition_pdf(
            file.absolute(), strategy="hi_res", infer_table_structure=True
        )
        return [
            self._element_to_document(element, extra_info)
            for element in partitioned
        ]
2 changes: 1 addition & 1 deletion nesis/rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ gevent==23.9.1
PyPDF2==3.0.1

chromadb==0.4.24
Werkzeug>=3.0.3
Werkzeug==3.0.1
pandas==2.2.1

injector==0.21.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from nesis.rag import tests
from nesis.rag.core.components.ingest.ingest_helper import IngestionHelper
from nesis.rag.core.components.ingest.readers import TiffReader
from nesis.rag.core.components.ingest.readers import TiffReader, PdfReader


@pytest.mark.parametrize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def settings() -> Settings:
@pytest.mark.parametrize(
"file_name",
[
"test2_transcript.txt",
# "test2_transcript.txt",
"file-sample_150kB.pdf",
],
)
Expand Down

0 comments on commit 4d4a20b

Please sign in to comment.