Skip to content

Commit

Permalink
ENH: context manager for pdfreader (#2666)
Browse files Browse the repository at this point in the history
* Add minimal change

* Beautify

* Update some tests to use the context manager

* Implement feedback from CR

* Add test for context manager with stream

---------

Co-authored-by: pubpub-zz <[email protected]>
  • Loading branch information
tibor-reiss and pubpub-zz authored May 26, 2024
1 parent 08731fa commit b9920fa
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 29 deletions.
38 changes: 35 additions & 3 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import re
from io import BytesIO, UnsupportedOperation
from pathlib import Path
from types import TracebackType
from typing import (
Any,
Callable,
Expand All @@ -39,6 +40,7 @@
List,
Optional,
Tuple,
Type,
Union,
cast,
)
Expand Down Expand Up @@ -111,7 +113,13 @@ def __init__(
self.flattened_pages: Optional[List[PageObject]] = None
#: Storage of parsed PDF objects.
self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {}

self.xref_index = 0
self.xref: Dict[int, Dict[Any, Any]] = {}
self.xref_free_entry: Dict[int, Dict[Any, Any]] = {}
self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
self.trailer = DictionaryObject()

self._page_id2num: Optional[
Dict[Any, Any]
] = None # map page indirect_reference number to Page Number
Expand All @@ -121,9 +129,11 @@ def __init__(
"It may not be read correctly.",
__name__,
)
self._stream_opened = False
if isinstance(stream, (str, Path)):
with open(stream, "rb") as fh:
stream = BytesIO(fh.read())
self._stream_opened = True
self.read(stream)
self.stream = stream

Expand Down Expand Up @@ -153,6 +163,28 @@ def __init__(
elif password is not None:
raise PdfReadError("Not encrypted file")

# Context-manager entry point: hands back this same reader instance, so
# `with PdfReader(...) as reader:` binds `reader` to the object that was
# just constructed. No additional setup happens here.
def __enter__(self) -> "PdfReader":
return self

# Context-manager exit point: unconditionally delegates to close().
# The exception triple (exc_type, exc_val, exc_tb) is accepted to satisfy the
# protocol but is never inspected, and nothing truthy is returned, so an
# exception raised inside the `with` body is not suppressed.
def __exit__(
self,
exc_type: Optional[Type[BaseException]],
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
self.close()

def close(self) -> None:
    """
    Release what this reader is holding on to.

    The stream is closed only when ``_stream_opened`` is set, i.e. when
    ``__init__`` was given a ``str``/``Path`` and read the file into a
    buffer of its own; a stream handed in by the caller is left alone.
    Afterwards the flattened page list, the resolved-object cache, the
    trailer and the three cross-reference tables are rebound to fresh,
    empty containers.
    """
    owns_stream = self._stream_opened
    if owns_stream:
        self.stream.close()

    # Reset parsed document state to empty containers.
    self.xref, self.xref_free_entry, self.xref_objStm = {}, {}, {}
    self.trailer = DictionaryObject()
    self.resolved_objects = {}
    self.flattened_pages = []

@property
def root_object(self) -> DictionaryObject:
"""Provide access to "/Root". Standardized with PdfWriter."""
Expand Down Expand Up @@ -776,9 +808,9 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
def _read_xref_tables_and_trailers(
self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
self.xref: Dict[int, Dict[Any, Any]] = {}
self.xref_free_entry: Dict[int, Dict[Any, Any]] = {}
self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
self.xref = {}
self.xref_free_entry = {}
self.xref_objStm = {}
self.trailer = DictionaryObject()
while startxref is not None:
# load the xref table
Expand Down
86 changes: 60 additions & 26 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@
)
# Checks that the reader reports the expected page count for each
# parametrized resource and that its trailer carries a "/Size" entry.
def test_get_num_pages(src, num_pages):
src = RESOURCE_ROOT / src
# NOTE(review): this span is a rendered diff hunk, so two versions of the
# body follow. The next four lines look like the earlier version, which
# builds the reader directly and never closes it.
reader = PdfReader(src)
assert len(reader.pages) == num_pages
# from #1911
assert "/Size" in reader.trailer
# The next four lines look like the replacement: the same two assertions,
# made while the reader is managed by a `with` statement.
with PdfReader(src) as reader:
assert len(reader.pages) == num_pages
# from #1911
assert "/Size" in reader.trailer


@pytest.mark.parametrize(
Expand Down Expand Up @@ -111,20 +111,20 @@ def test_read_metadata(pdf_path, expected):


# Exercises the date accessors of the document metadata: both date keys are
# overwritten with text strings, the parsed and raw accessors are read, then
# "/CreationDate" is replaced by a NumberObject and creation_date must be None.
def test_iss1943():
# NOTE(review): this span is a rendered diff hunk, so two versions of the
# body follow. The first one (down to the first `assert`) looks like the
# earlier version, which builds the reader directly and never closes it.
reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
docinfo = reader.metadata
docinfo.update(
{
NameObject("/CreationDate"): TextStringObject("D:20230705005151Z00'00'"),
NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"),
}
)
# The four bare attribute reads below have no assertion attached; they only
# need to complete without raising.
docinfo.creation_date
docinfo.creation_date_raw
docinfo.modification_date
docinfo.modification_date_raw
docinfo.update({NameObject("/CreationDate"): NumberObject(1)})
assert docinfo.creation_date is None
# From here on: what looks like the replacement version, with the same
# statements run while the reader is managed by a `with` statement.
with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
docinfo = reader.metadata
docinfo.update(
{
NameObject("/CreationDate"): TextStringObject("D:20230705005151Z00'00'"),
NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"),
}
)
docinfo.creation_date
docinfo.creation_date_raw
docinfo.modification_date
docinfo.modification_date_raw
docinfo.update({NameObject("/CreationDate"): NumberObject(1)})
assert docinfo.creation_date is None


@pytest.mark.samples()
Expand Down Expand Up @@ -152,14 +152,13 @@ def test_broken_meta_data(pdf_path):
],
)
# Walks every page of the parametrized file and, for pages that have an
# annotations entry, resolves each annotation and reads its contents when the
# subtype is "/Text". There are no assertions; the accesses only need to
# complete without raising.
def test_get_annotations(src):
# NOTE(review): this span is a rendered diff hunk, so two versions of the
# body follow. The first one looks like the earlier version, which builds
# the reader directly and never closes it.
reader = PdfReader(src)

for page in reader.pages:
if PG.ANNOTS in page:
for annot in page[PG.ANNOTS]:
subtype = annot.get_object()[IA.SUBTYPE]
if subtype == "/Text":
annot.get_object()[PG.CONTENTS]
# From here on: what looks like the replacement version, with the same loop
# run while the reader is managed by a `with` statement.
with PdfReader(src) as reader:
for page in reader.pages:
if PG.ANNOTS in page:
for annot in page[PG.ANNOTS]:
subtype = annot.get_object()[IA.SUBTYPE]
if subtype == "/Text":
annot.get_object()[PG.CONTENTS]


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1543,3 +1542,38 @@ def test_looping_form(caplog):
flds2 = writer.get_fields()
assert "Text68.0 already parsed" in caplog.text
assert list(flds.keys()) == list(flds2.keys())


def test_context_manager_with_stream():
    """A stream supplied by the caller must not be closed by the reader's context manager."""
    # %-style template for a one-page PDF; the placeholders take the five
    # object offsets and the startxref value.
    template = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )

    # All offsets are looked up in the unformatted template.
    markers = (b"1 0 obj", b"2 0 obj", b"3 0 obj", b"4 0 obj", b"5 0 obj")
    object_offsets = tuple(template.find(marker) for marker in markers)
    # NOTE(review): the -1 presumably accounts for the byte lost when the
    # leading "%%" collapses to "%" during formatting - confirm.
    xref_offset = template.find(b"xref") - 1
    pdf_data = template % (*object_offsets, xref_offset)

    pdf_stream = io.BytesIO(pdf_data)
    with PdfReader(pdf_stream) as reader:
        assert not reader.stream.closed
    # Leaving the context must not have closed the caller's stream.
    assert not pdf_stream.closed

0 comments on commit b9920fa

Please sign in to comment.