ENH: context manager for pdfreader (#2666)

* Add minimal change
* Beautify
* Update some tests to use the context manager
* Implement feedback from CR
* Add test for context manager with stream

---------

Co-authored-by: pubpub-zz <[email protected]>
py-pdf · May 26, 2024 · b9920fa · b9920fa
1 parent 08731fa
commit b9920fa
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 29 deletions.
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -31,6 +31,7 @@
 import re
 from io import BytesIO, UnsupportedOperation
 from pathlib import Path
+from types import TracebackType
 from typing import (
     Any,
     Callable,
@@ -39,6 +40,7 @@
     List,
     Optional,
     Tuple,
+    Type,
     Union,
     cast,
 )
@@ -111,7 +113,13 @@ def __init__(
         self.flattened_pages: Optional[List[PageObject]] = None
         #: Storage of parsed PDF objects.
         self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {}
+
         self.xref_index = 0
+        self.xref: Dict[int, Dict[Any, Any]] = {}
+        self.xref_free_entry: Dict[int, Dict[Any, Any]] = {}
+        self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
+        self.trailer = DictionaryObject()
+
         self._page_id2num: Optional[
             Dict[Any, Any]
         ] = None  # map page indirect_reference number to Page Number
@@ -121,9 +129,11 @@ def __init__(
                 "It may not be read correctly.",
                 __name__,
             )
+        self._stream_opened = False
         if isinstance(stream, (str, Path)):
             with open(stream, "rb") as fh:
                 stream = BytesIO(fh.read())
+            self._stream_opened = True
         self.read(stream)
         self.stream = stream
 
@@ -153,6 +163,28 @@ def __init__(
         elif password is not None:
             raise PdfReadError("Not encrypted file")
 
+    def __enter__(self) -> "PdfReader":
+        """Enter the ``with`` block and hand back this reader unchanged."""
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        """
+        Leave the ``with`` block by calling :meth:`close`.
+
+        The exception arguments are not inspected and nothing is returned,
+        so an exception raised inside the block is not suppressed.
+        """
+        self.close()
+
+    def close(self) -> None:
+        """
+        Close the stream if opened in __init__ and clear memory.
+
+        The stream is closed only when ``_stream_opened`` is set, i.e. when
+        ``__init__`` was given a ``str``/``Path`` and built the in-memory
+        buffer itself. A stream object supplied by the caller is left open.
+        """
+        if self._stream_opened:
+            self.stream.close()
+        # Drop the parsed state by rebinding each attribute to an empty
+        # container. Note that ``flattened_pages`` becomes an empty list
+        # here, whereas ``__init__`` starts it as ``None``.
+        self.flattened_pages = []
+        self.resolved_objects = {}
+        self.trailer = DictionaryObject()
+        self.xref = {}
+        self.xref_free_entry = {}
+        self.xref_objStm = {}
+
     @property
     def root_object(self) -> DictionaryObject:
         """Provide access to "/Root". Standardized with PdfWriter."""
@@ -776,9 +808,9 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
     def _read_xref_tables_and_trailers(
         self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
     ) -> None:
-        self.xref: Dict[int, Dict[Any, Any]] = {}
-        self.xref_free_entry: Dict[int, Dict[Any, Any]] = {}
-        self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
+        self.xref = {}
+        self.xref_free_entry = {}
+        self.xref_objStm = {}
         self.trailer = DictionaryObject()
         while startxref is not None:
             # load the xref table

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -46,10 +46,10 @@
 )
 def test_get_num_pages(src, num_pages):
     src = RESOURCE_ROOT / src
-    reader = PdfReader(src)
-    assert len(reader.pages) == num_pages
-    # from #1911
-    assert "/Size" in reader.trailer
+    with PdfReader(src) as reader:
+        assert len(reader.pages) == num_pages
+        # from #1911
+        assert "/Size" in reader.trailer
 
 
 @pytest.mark.parametrize(
@@ -111,20 +111,20 @@ def test_read_metadata(pdf_path, expected):
 
 
 def test_iss1943():
-    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
-    docinfo = reader.metadata
-    docinfo.update(
-        {
-            NameObject("/CreationDate"): TextStringObject("D:20230705005151Z00'00'"),
-            NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"),
-        }
-    )
-    docinfo.creation_date
-    docinfo.creation_date_raw
-    docinfo.modification_date
-    docinfo.modification_date_raw
-    docinfo.update({NameObject("/CreationDate"): NumberObject(1)})
-    assert docinfo.creation_date is None
+    with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
+        docinfo = reader.metadata
+        docinfo.update(
+            {
+                NameObject("/CreationDate"): TextStringObject("D:20230705005151Z00'00'"),
+                NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"),
+            }
+        )
+        docinfo.creation_date
+        docinfo.creation_date_raw
+        docinfo.modification_date
+        docinfo.modification_date_raw
+        docinfo.update({NameObject("/CreationDate"): NumberObject(1)})
+        assert docinfo.creation_date is None
 
 
 @pytest.mark.samples()
@@ -152,14 +152,13 @@ def test_broken_meta_data(pdf_path):
     ],
 )
 def test_get_annotations(src):
-    reader = PdfReader(src)
-
-    for page in reader.pages:
-        if PG.ANNOTS in page:
-            for annot in page[PG.ANNOTS]:
-                subtype = annot.get_object()[IA.SUBTYPE]
-                if subtype == "/Text":
-                    annot.get_object()[PG.CONTENTS]
+    with PdfReader(src) as reader:
+        for page in reader.pages:
+            if PG.ANNOTS in page:
+                for annot in page[PG.ANNOTS]:
+                    subtype = annot.get_object()[IA.SUBTYPE]
+                    if subtype == "/Text":
+                        annot.get_object()[PG.CONTENTS]
 
 
 @pytest.mark.parametrize(
@@ -1543,3 +1542,38 @@ def test_looping_form(caplog):
     flds2 = writer.get_fields()
     assert "Text68.0 already parsed" in caplog.text
     assert list(flds.keys()) == list(flds2.keys())
+
+
+def test_context_manager_with_stream():
+    """A stream passed in by the caller must stay open after the ``with`` block."""
+    # Template for a small one-page PDF. ``%%`` is an escaped percent sign;
+    # the ``%010d`` / ``%d`` placeholders are filled in below with offsets.
+    pdf_data = (
+        b"%%PDF-1.7\n"
+        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 5\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << /Root 5 0 R /Size 6 >>\n"
+        b"startxref %d\n"
+        b"%%%%EOF"
+    )
+    # The offsets are looked up in the *unformatted* template.
+    # NOTE(review): the ``- 1`` on the xref offset presumably compensates for
+    # the leading ``%%`` collapsing to ``%`` during formatting; the five
+    # object offsets are not adjusted the same way - confirm this is intended.
+    pdf_data = pdf_data % (
+        pdf_data.find(b"1 0 obj"),
+        pdf_data.find(b"2 0 obj"),
+        pdf_data.find(b"3 0 obj"),
+        pdf_data.find(b"4 0 obj"),
+        pdf_data.find(b"5 0 obj"),
+        pdf_data.find(b"xref") - 1,
+    )
+    pdf_stream = io.BytesIO(pdf_data)
+    with PdfReader(pdf_stream) as reader:
+        # Still open while the reader is in use ...
+        assert not reader.stream.closed
+    # ... and still open after leaving the block, because the reader did not
+    # create this stream itself.
+    assert not pdf_stream.closed