Skip to content

Commit

Permalink
TST: Image extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Jul 9, 2022
1 parent f776f54 commit c3515d4
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
1 change: 1 addition & 0 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
IA.COLOR_SPACE in x_object_obj
and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
):
# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
mode: Literal["RGB", "P"] = "RGB"
else:
mode = "P"
Expand Down
33 changes: 30 additions & 3 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,9 +378,36 @@ def test_merge_output():
merger.close()


def test_image_extraction():
url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf"
name = "tika-994636.pdf"
@pytest.mark.parametrize(
("url", "name"),
[
(
"https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf",
"tika-994636.pdf",
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/952/952133.pdf",
"tika-952133.pdf",
),
( # JPXDecode
"https://corpora.tika.apache.org/base/docs/govdocs1/914/914568.pdf",
"tika-914568.pdf",
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/952/952016.pdf",
"tika-952016.pdf",
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/965/965118.pdf",
"tika-965118.pdf",
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/959/959184.pdf",
"tika-959184.pdf",
),
],
)
def test_image_extraction(url, name):
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data)

Expand Down

0 comments on commit c3515d4

Please sign in to comment.