Skip to content

Commit

Permalink
TST: Improve test coverage by extracting texts (#998)
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Jun 16, 2022
1 parent 034d7a9 commit 6ce36f7
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 11 deletions.
4 changes: 2 additions & 2 deletions PyPDF2/_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,12 @@ def merge(

# Gather all the pages that are going to be merged
for i in range(*pages):
pg = reader.pages[i]
page = reader.pages[i]

id = self.id_count
self.id_count += 1

mp = _MergedPage(pg, reader, id)
mp = _MergedPage(page, reader, id)

srcpages.append(mp)

Expand Down
16 changes: 11 additions & 5 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleOb
return retval


def getRectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
def getRectangle(
self: Any, name: str, defaults: Iterable[str]
) -> RectangleObject: # pragma: no cover
deprecate_no_replacement("getRectangle")
return _get_rectangle(self, name, defaults)

Expand All @@ -98,7 +100,9 @@ def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -
self[name] = value


def setRectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
def setRectangle(
self: Any, name: str, value: Union[RectangleObject, float]
) -> None: # pragma: no cover
deprecate_no_replacement("setRectangle")
_set_rectangle(self, name, value)

Expand All @@ -107,7 +111,7 @@ def _delete_rectangle(self: Any, name: str) -> None:
del self[name]


def deleteRectangle(self: Any, name: str) -> None:
def deleteRectangle(self: Any, name: str) -> None: # pragma: no cover
deprecate_no_replacement("deleteRectangle")
del self[name]

Expand All @@ -120,7 +124,9 @@ def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
)


def createRectangleAccessor(name: str, fallback: Iterable[str]) -> property:
def createRectangleAccessor(
name: str, fallback: Iterable[str]
) -> property: # pragma: no cover
deprecate_no_replacement("createRectangleAccessor")
return _create_rectangle_accessor(name, fallback)

Expand Down Expand Up @@ -1064,7 +1070,7 @@ def _extract_text_old(
text += "\n"
return text

def _debug_for_extract(self) -> str:
def _debug_for_extract(self) -> str: # pragma: no cover
out = ""
for ope, op in ContentStream(
self["/Contents"].getObject(), self.pdf, "bytes"
Expand Down
4 changes: 3 additions & 1 deletion PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1554,7 +1554,9 @@ def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
if (i + 1) >= len(array):
break

def read_next_end_line(self, stream: StreamType, limit_offset: int = 0) -> bytes:
def read_next_end_line(
self, stream: StreamType, limit_offset: int = 0
) -> bytes: # pragma: no cover
""".. deprecated:: 2.1.0"""
deprecate_no_replacement("read_next_end_line", removed_in="4.0.0")
line_parts = []
Expand Down
40 changes: 40 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from io import BytesIO

import pytest

from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadWarning

from . import get_pdf_from_url


def test_compute_space_width():
    """Smoke-test text extraction on govdocs1 sample 923406.

    The test passes when ``extract_text`` completes on every page without
    raising; no extracted content is asserted.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf",
        name="tika-923406.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_parse_to_unicode_process_rg():
    """Smoke-test text extraction on govdocs1 sample 959173, twice.

    The document is read once with the reader's default options and once
    with ``strict=True``; in both passes every page must extract without
    raising.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf"
    name = "tika-959173.pdf"

    # An empty mapping means "default reader options" for the first pass.
    for reader_options in ({}, {"strict": True}):
        document = PdfReader(
            BytesIO(get_pdf_from_url(url, name=name)), **reader_options
        )
        for pg in document.pages:
            pg.extract_text()


def test_parse_encoding_advanced_encoding_not_implemented():
    """Expect an "Advanced encoding ... not implemented yet" warning.

    Extracting text from all pages of govdocs1 sample 957144 must emit at
    least one ``PdfReadWarning`` whose message matches that pattern.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf",
        name="tika-957144.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))

    # The pattern is a regex on purpose: ``.*`` stands for the encoding name.
    expected_warning = "Advanced encoding .* not implemented yet"
    with pytest.warns(PdfReadWarning, match=expected_warning):
        for pg in document.pages:
            pg.extract_text()
42 changes: 41 additions & 1 deletion tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.constants import TypFitArguments as TF
from PyPDF2.errors import PdfReadError, PdfStreamError
from PyPDF2.errors import PdfReadError, PdfReadWarning, PdfStreamError
from PyPDF2.generic import (
ArrayObject,
Bookmark,
Expand All @@ -28,6 +28,8 @@
readStringFromStream,
)

from . import get_pdf_from_url

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources")
Expand Down Expand Up @@ -402,3 +404,41 @@ def test_remove_child_in_tree():
tree.remove_child(obj)
tree.add_child(obj, writer)
tree.emptyTree()


def test_dict_read_from_stream():
    """Every page of govdocs1 sample 984877 must warn during extraction.

    The ``pytest.warns`` context is entered once per page, so each page
    individually has to emit a ``PdfReadWarning``.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/984/984877.pdf",
        name="tika-984877.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        with pytest.warns(PdfReadWarning):
            pg.extract_text()


def test_parse_content_stream_peek_percentage():
    """Smoke-test text extraction on govdocs1 sample 985770.

    Passes when every page extracts without raising; nothing is asserted
    about the extracted text.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/985/985770.pdf",
        name="tika-985770.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_read_inline_image_no_has_q():
    """Smoke-test text extraction on govdocs1 sample 998719.

    Passes when every page extracts without raising.
    """
    # pdf/df7e1add3156af17a372bc165e47a244.pdf
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/998/998719.pdf",
        name="tika-998719.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_read_inline_image_loc_neg_1():
    """Smoke-test text extraction on govdocs1 sample 935066.

    Passes when every page extracts without raising.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/935/935066.pdf",
        name="tika-935066.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()
40 changes: 40 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from PyPDF2 import PdfReader, Transformation
from PyPDF2._page import PageObject
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.errors import PdfReadWarning
from PyPDF2.generic import DictionaryObject, NameObject, RectangleObject

from . import get_pdf_from_url
Expand Down Expand Up @@ -218,3 +219,42 @@ def test_multi_language():
assert "你好世界" in txt, "Chinese not correctly extracted"
assert "สวัสดีชาวโลก" in txt, "Thai not correctly extracted"
assert "こんにちは世界" in txt, "Japanese not correctly extracted"


def test_extract_text_single_quote_op():
    """Smoke-test text extraction on govdocs1 sample 964029.

    Passes when every page extracts without raising.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/964/964029.pdf",
        name="tika-964029.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


@pytest.mark.parametrize(
    ("url", "name"),
    [
        # keyerror_potentially_empty_page
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/964/964029.pdf",
            "tika-964029.pdf",
        ),
        # 1140 / 1141:
        # NOTE(review): presumably source line numbers this sample is
        # meant to exercise -- confirm.
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/932/932446.pdf",
            "tika-932446.pdf",
        ),
    ],
)
def test_extract_text_page_pdf(url, name):
    """Smoke-test text extraction for each parametrized sample PDF.

    ``url`` is the download location and ``name`` is passed through to
    ``get_pdf_from_url``. The test passes when every page extracts
    without raising.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_extract_text_page_pdf_impossible_decode_xform():
    """Expect the "impossible to decode XFormObject /Meta203" warning.

    Extracting text from all pages of govdocs1 sample 972962 must emit at
    least one ``PdfReadWarning`` matching that message.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/972/972962.pdf",
        name="tika-972962.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))

    expected_warning = "impossible to decode XFormObject /Meta203"
    with pytest.warns(PdfReadWarning, match=expected_warning):
        for pg in document.pages:
            pg.extract_text()
40 changes: 38 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ def test_get_attachments(src):
reader = PdfReader(src)

attachments = {}
for i in range(len(reader.pages)):
page = reader.pages[i]
for page in reader.pages:
if PG.ANNOTS in page:
for annotation in page[PG.ANNOTS]:
annotobj = annotation.get_object()
Expand Down Expand Up @@ -705,6 +704,15 @@ def test_read_path():
assert len(reader.pages) == 1


def test_read_not_binary_mode():
    """Opening a PDF in text mode must warn and then fail.

    ``PdfReader`` is handed a file object opened without ``"b"``. The test
    expects a ``PdfReadWarning`` carrying the "not in binary mode" message
    and an ``io.UnsupportedOperation`` raised by the constructor.
    """
    import re  # local import: only needed to escape the literal message

    msg = "PdfReader stream/file object is not in binary mode. It may not be read correctly."
    # Text mode is the point of this test, so no "rb" here.
    with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) as f:
        # ``match`` is treated as a regular expression; escape the literal
        # text so the dots only match real dots instead of any character.
        with pytest.warns(PdfReadWarning, match=re.escape(msg)), pytest.raises(
            io.UnsupportedOperation
        ):
            PdfReader(f)


@pytest.mark.xfail(reason="#416")
def test_read_form_416():
url = (
Expand All @@ -713,3 +721,31 @@ def test_read_form_416():
reader = PdfReader(BytesIO(get_pdf_from_url(url, name="issue_416.pdf")))
fields = reader.get_form_text_fields()
assert len(fields) > 0


def test_extract_text_xref_issue_2():
    """Expect the "incorrect startxref pointer(2)" warning on sample 981961.

    The warning must be emitted while the reader is constructed; afterwards
    every page must extract without raising.
    """
    # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
    # NOTE(review): this same identifier is attached to several different
    # URLs in this file -- confirm which sample it really belongs to.
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf",
        name="tika-981961.pdf",
    )
    expected_warning = r"incorrect startxref pointer\(2\)"
    with pytest.warns(PdfReadWarning, match=expected_warning):
        document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_extract_text_xref_issue_3():
    """Expect the "incorrect startxref pointer(3)" warning on sample 977774.

    The warning must be emitted while the reader is constructed; afterwards
    every page must extract without raising.
    """
    # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
    # NOTE(review): this same identifier is attached to several different
    # URLs in this file -- confirm which sample it really belongs to.
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf",
        name="tika-977774.pdf",
    )
    expected_warning = r"incorrect startxref pointer\(3\)"
    with pytest.warns(PdfReadWarning, match=expected_warning):
        document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()


def test_extract_text_pdf15():
    """Smoke-test text extraction on govdocs1 sample 976030.

    Passes when every page extracts without raising.
    """
    # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
    # NOTE(review): this same identifier is attached to several different
    # URLs in this file -- confirm which sample it really belongs to.
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/976/976030.pdf",
        name="tika-976030.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for pg in document.pages:
        pg.extract_text()

0 comments on commit 6ce36f7

Please sign in to comment.