Skip to content

Commit

Permalink
[Community]: Image Extraction Fixed for PDFPlumberParser (#28491)
Browse files Browse the repository at this point in the history
- **Description:** One-bit images were raising an error, which has been fixed
in this PR for `PDFPlumberParser`.
 - **Issue:** #28480

---------

Co-authored-by: Chester Curme <[email protected]>
  • Loading branch information
keenborder786 and ccurme authored Dec 18, 2024
1 parent f723a84 commit d49df48
Showing 1 changed file with 24 additions and 4 deletions.
28 changes: 24 additions & 4 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,13 @@ def __init__(
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
"""
try:
import PIL # noqa:F401
except ImportError:
raise ImportError(
"pillow package not found, please install it with"
" `pip install pillow`"
)
self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe
self.extract_images = extract_images
Expand Down Expand Up @@ -468,17 +475,30 @@ def _process_page_content(self, page: pdfplumber.page.Page) -> str:

def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
"""Extract images from page and get the text with RapidOCR."""
from PIL import Image

if not self.extract_images:
return ""

images = []
for img in page.images:
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
images.append(
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
img["stream"]["Height"], img["stream"]["Width"], -1
if img["stream"]["BitsPerComponent"] == 1:
images.append(
np.array(
Image.frombytes(
"1",
(img["stream"]["Width"], img["stream"]["Height"]),
img["stream"].get_data(),
).convert("L")
)
)
else:
images.append(
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
img["stream"]["Height"], img["stream"]["Width"], -1
)
)
)
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
images.append(img["stream"].get_data())
else:
Expand Down

0 comments on commit d49df48

Please sign in to comment.