Skip to content

Commit

Permalink
community[patch]: add to pypdf tests and run in CI (langchain-ai#26663)
Browse files Browse the repository at this point in the history
  • Loading branch information
ccurme authored and Sheepsta300 committed Oct 1, 2024
1 parent 05d8699 commit c902a64
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 33 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
from pathlib import Path
from typing import Sequence, Union

Expand All @@ -11,7 +10,6 @@
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFium2Loader,
PyPDFLoader,
UnstructuredPDFLoader,
)

Expand Down Expand Up @@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
assert len(docs) == 1


def test_pypdf_loader() -> None:
"""Test PyPDFLoader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PyPDFLoader(str(file_path))
docs = loader.load()

assert len(docs) == 1

file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyPDFLoader(str(file_path))

docs = loader.load()
assert len(docs) == 16


def test_pypdf_loader_with_layout() -> None:
"""Test PyPDFLoader with layout mode."""
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyPDFLoader(str(file_path), extraction_mode="layout")

docs = loader.load()
first_page = docs[0].page_content

expected = (
Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
).read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected


def test_pypdfium2_loader() -> None:
"""Test PyPDFium2Loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
Expand Down
62 changes: 62 additions & 0 deletions libs/community/tests/unit_tests/document_loaders/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re
from pathlib import Path

import pytest

from langchain_community.document_loaders import PyPDFLoader

path_to_simple_pdf = (
Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf"
)
path_to_layout_pdf = (
Path(__file__).parent.parent
/ "document_loaders/sample_documents/layout-parser-paper.pdf"
)
path_to_layout_pdf_txt = (
Path(__file__).parent.parent.parent
/ "integration_tests/examples/layout-parser-paper-page-1.txt"
)


@pytest.mark.requires("pypdf")
def test_pypdf_loader() -> None:
"""Test PyPDFLoader."""
loader = PyPDFLoader(str(path_to_simple_pdf))
docs = loader.load()

assert len(docs) == 1

loader = PyPDFLoader(str(path_to_layout_pdf))

docs = loader.load()
assert len(docs) == 16
for page, doc in enumerate(docs):
assert doc.metadata["page"] == page
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
assert len(doc.page_content) > 10

first_page = docs[0].page_content
for expected in ["LayoutParser", "A Unified Toolkit"]:
assert expected in first_page


@pytest.mark.requires("pypdf")
def test_pypdf_loader_with_layout() -> None:
"""Test PyPDFLoader with layout mode."""
loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout")

docs = loader.load()
assert len(docs) == 16
for page, doc in enumerate(docs):
assert doc.metadata["page"] == page
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
assert len(doc.page_content) > 10

first_page = docs[0].page_content
for expected in ["LayoutParser", "A Unified Toolkit"]:
assert expected in first_page

expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected

0 comments on commit c902a64

Please sign in to comment.