Skip to content

Commit

Permalink
TST: Image extraction (#1077)
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Jul 9, 2022
1 parent 67d6e09 commit 67d962d
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ PyPDF2_pdfLocation.txt
.python-version
tests/pdf_cache/
docs/meta/CHANGELOG.md
extracted-images/
36 changes: 36 additions & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@
import os
import sys
from io import BytesIO
from pathlib import Path

import pytest

from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url

Expand Down Expand Up @@ -372,3 +376,35 @@ def test_merge_output():

# Cleanup
merger.close()


def test_image_extraction():
    """Extract the image XObjects of a sample PDF to disk, then clean up.

    The PDF is fetched through ``get_pdf_from_url`` and every page that has
    an XObject resource dictionary is scanned. Each entry whose subtype is
    ``/Image`` is converted with ``_xobj_to_image`` and, when an extension
    is returned, written to ``extracted-images/<name><extension>``.

    The written files are removed again even if extraction fails part-way,
    unless ``do_cleanup`` is switched off for manual inspection.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf"
    name = "tika-994636.pdf"
    data = BytesIO(get_pdf_from_url(url, name=name))
    reader = PdfReader(data)

    images_extracted = []
    root = Path("extracted-images")
    # exist_ok avoids the check-then-create race of `exists()` + `mkdir`.
    root.mkdir(exist_ok=True)

    do_cleanup = True  # set this to False for manual inspection
    try:
        for page in reader.pages:
            if RES.XOBJECT not in page[PG.RESOURCES]:
                continue
            x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

            for obj in x_object:
                if x_object[obj][IA.SUBTYPE] != "/Image":
                    continue
                extension, byte_stream = _xobj_to_image(x_object[obj])
                if extension is None:
                    continue
                # obj is the XObject name; drop its first character
                # (presumably the leading "/" of a PDF name object).
                filename = root / (obj[1:] + extension)
                # Record the path before writing so that a partially
                # written file is still removed during cleanup.
                images_extracted.append(filename)
                with open(filename, "wb") as img:
                    img.write(byte_stream)
    finally:
        # Cleanup runs even when extraction raised, so failed runs do not
        # leave stray files behind in the working directory.
        if do_cleanup:
            for filepath in images_extracted:
                if os.path.exists(filepath):
                    os.remove(filepath)

0 comments on commit 67d962d

Please sign in to comment.