Merge branch 'main' into graphics_state_isolation

py-pdf · Dec 9, 2023 · 4cbd1ab · 4cbd1ab
2 parents 74dc3fd + 4aae547
commit 4cbd1ab
Show file tree

Hide file tree

Showing 22 changed files with 290 additions and 192 deletions.
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
@@ -21,7 +21,7 @@ jobs:
       with:
         submodules: 'recursive'
     - name: Setup Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install requirements (Python 3)

diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -26,7 +26,7 @@ jobs:
       with:
         submodules: 'recursive'
     - name: Setup Python (3.11+)
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: 3.12  # latest stable python
         allow-prereleases: true
@@ -80,14 +80,14 @@ jobs:
         path: '**/tests/pdf_cache/*'
         key: cache-downloaded-files
     - name: Setup Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       if: matrix.python-version == '3.6' || matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
         cache-dependency-path: '**/requirements/ci.txt'
     - name: Setup Python (3.11+)
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
       with:
         python-version: ${{ matrix.python-version }}
@@ -143,7 +143,7 @@ jobs:
       with:
         submodules: 'recursive'
     - name: Setup Python 3.11
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: "3.11"
         cache: 'pip'
@@ -171,7 +171,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{env.PYTHON_LATEST}}
 
@@ -194,7 +194,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           # Use latest Python, so it understands all syntax.
           python-version: ${{env.PYTHON_LATEST}}

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -20,7 +20,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.x
 

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -15,6 +15,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra
 * [ArkieCoder](https://github.com/ArkieCoder)
 * [Clauss, Christian](https://github.com/cclauss)
 * [DL6ER](https://github.com/DL6ER)
+* [Duy, Phan Thanh](https://github.com/zuypt)
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -6,7 +6,13 @@
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import b_, logger_warning
 from .errors import PdfReadWarning
-from .generic import DecodedStreamObject, DictionaryObject, IndirectObject, NullObject, StreamObject
+from .generic import (
+    DecodedStreamObject,
+    DictionaryObject,
+    IndirectObject,
+    NullObject,
+    StreamObject,
+)
 
 
 # code freely inspired from @twiggy ; see #711

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -2240,7 +2240,10 @@ def clean_forms(
                 # to prevent infinite looping
                 return [], []  # pragma: no cover
             try:
-                d = cast(Dict[Any, Any], cast(DictionaryObject, elt["/Resources"])["/XObject"])
+                d = cast(
+                    Dict[Any, Any],
+                    cast(DictionaryObject, elt["/Resources"])["/XObject"],
+                )
             except KeyError:
                 d = {}
             images = []
@@ -3188,7 +3191,9 @@ def _add_articles_thread(
 
     def add_filtered_articles(
         self,
-        fltr: Union[Pattern[Any], str],  # thread entry from the reader's array of threads
+        fltr: Union[
+            Pattern[Any], str
+        ],  # thread entry from the reader's array of threads
         pages: Dict[int, PageObject],
         reader: PdfReader,
     ) -> None:

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
@@ -91,10 +91,18 @@ def _get_imagemode(
         )
         return mode2, True
     elif color_space[0] == "/DeviceN":
+        original_color_space = color_space
         color_components = len(color_space[1])
         color_space = color_space[2]
         if isinstance(color_space, IndirectObject):  # pragma: no cover
             color_space = color_space.get_object()
+        if color_space == "/DeviceCMYK" and color_components == 1:
+            if original_color_space[1][0] != "/Black":
+                logger_warning(
+                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
+                    __name__,
+                )
+            return "L", True
         mode2, invert_color = _get_imagemode(
             color_space, color_components, prev_mode, depth + 1
         )

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -558,13 +558,16 @@ class CCITTFaxDecode:
 
     @staticmethod
     def _get_parameters(
-        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], rows: int
+        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
+        rows: int,
     ) -> CCITParameters:
         # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
         k = 0
         columns = 1728
         if parameters:
-            parameters_unwrapped = cast(Union[ArrayObject, DictionaryObject], parameters.get_object())
+            parameters_unwrapped = cast(
+                Union[ArrayObject, DictionaryObject], parameters.get_object()
+            )
             if isinstance(parameters_unwrapped, ArrayObject):
                 for decode_parm in parameters_unwrapped:
                     if CCITT.COLUMNS in decode_parm:
@@ -778,8 +781,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     alpha = None
     filters = x_object_obj.get(SA.FILTER, [None])
     lfilters = filters[-1] if isinstance(filters, list) else filters
-    if lfilters == FT.FLATE_DECODE:
-        img, image_format, extension, invert_color = _handle_flate(
+    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
+        img, image_format, extension, _ = _handle_flate(
             size,
             data,
             mode,
@@ -821,15 +824,14 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
             ".png",
             False,
         )
-
     # CMYK image and other colorspaces without decode
     # requires reverting scale (cf p243,2§ last sentence)
     decode = x_object_obj.get(
         IA.DECODE,
         ([1.0, 0.0] * len(img.getbands()))
         if (
-            (img.mode == "CMYK" or (invert_color and img.mode == "L"))
-            and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
+            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
+            or (invert_color and img.mode == "L")
         )
         else None,
     )

diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
@@ -727,7 +727,7 @@ def readFromStream(
 
 
 def encode_pdfdocencoding(unicode_string: str) -> bytes:
-    retval = b""
+    retval = bytearray()
     for c in unicode_string:
         try:
             retval += b_(chr(_pdfdoc_encoding_rev[c]))

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -420,7 +420,7 @@ def get_next_obj_pos(
             else:
                 return get_next_obj_pos(p, p1, rem_gens[1:], pdf)
 
-        def read_unsized_from_steam(
+        def read_unsized_from_stream(
             stream: StreamType, pdf: PdfReaderProtocol
         ) -> bytes:
             # we are just pointing at beginning of the stream
@@ -535,7 +535,7 @@ def read_unsized_from_steam(
                     data["__streamdata__"] = data["__streamdata__"][:-1]
                 elif pdf is not None and not pdf.strict:
                     stream.seek(pstart, 0)
-                    data["__streamdata__"] = read_unsized_from_steam(stream, pdf)
+                    data["__streamdata__"] = read_unsized_from_stream(stream, pdf)
                     pos = stream.tell()
                 else:
                     stream.seek(pos, 0)

diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
@@ -69,6 +69,8 @@ pytest-timeout==2.1.0
     # via -r requirements/ci.in
 pytest-xdist==3.3.1
     # via -r requirements/ci.in
+pyyaml==6.0.1
+    # via -r requirements/ci.in
 ruff==0.0.290
     # via -r requirements/ci.in
 typeguard==4.1.2

diff --git a/requirements/ci.in b/requirements/ci.in
@@ -17,3 +17,4 @@ pytest-cov
 typeguard
 types-dataclasses
 types-Pillow
+pyyaml
diff --git a/requirements/ci.txt b/requirements/ci.txt
@@ -83,6 +83,8 @@ pytest-timeout==2.1.0
     # via -r requirements/ci.in
 pytest-xdist==3.0.2
     # via -r requirements/ci.in
+pyyaml==6.0.1
+    # via -r requirements/ci.in
 six==1.16.0
     # via flake8-print
 tomli==1.2.3

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,13 +1,16 @@
+import concurrent.futures
 import ssl
 import urllib.request
 from pathlib import Path
-from typing import List
+from typing import Dict, List, Optional
 from urllib.error import HTTPError
 
+import yaml
+
 from pypdf.generic import DictionaryObject, IndirectObject
 
 
-def get_data_from_url(url: str, name: str) -> bytes:
+def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes:
     """
     Download a File from a URL and return its contents.
 
@@ -22,28 +25,33 @@ def get_data_from_url(url: str, name: str) -> bytes:
     Returns:
         Read File as bytes
     """
-    if url.startswith("file://"):
-        with open(url[7:].replace("\\", "/"), "rb") as fp:
-            return fp.read()
+    if name is None:
+        raise ValueError("A name must always be specified")
+
     cache_dir = Path(__file__).parent / "pdf_cache"
     if not cache_dir.exists():
         cache_dir.mkdir()
     cache_path = cache_dir / name
-    if not cache_path.exists():
-        ssl._create_default_https_context = ssl._create_unverified_context
-        cpt = 3
-        while cpt > 0:
-            try:
-                with urllib.request.urlopen(  # noqa: S310
-                    url
-                ) as response, cache_path.open("wb") as out_file:
-                    out_file.write(response.read())
-                cpt = 0
-            except HTTPError as e:
-                if cpt > 0:
-                    cpt -= 1
-                else:
-                    raise e
+
+    if url is not None:
+        if url.startswith("file://"):
+            with open(url[7:].replace("\\", "/"), "rb") as fp:
+                return fp.read()
+        if not cache_path.exists():
+            ssl._create_default_https_context = ssl._create_unverified_context
+            cpt = 3
+            while cpt > 0:
+                try:
+                    with urllib.request.urlopen(  # noqa: S310
+                        url
+                    ) as response, cache_path.open("wb") as out_file:
+                        out_file.write(response.read())
+                    cpt = 0
+                except HTTPError as e:
+                    if cpt > 0:
+                        cpt -= 1
+                    else:
+                        raise e
     with open(cache_path, "rb") as fp:
         data = fp.read()
     return data
@@ -106,12 +114,32 @@ def is_sublist(child_list, parent_list):
     return is_sublist(child_list, parent_list[1:])
 
 
+def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]:
+    """Return the content of ``yaml_file`` as parsed by ``yaml.safe_load``."""
+    with open(yaml_file) as handle:
+        return yaml.safe_load(handle)
+
+
 def download_test_pdfs():
     """
     Run this before the tests are executed to ensure you have everything locally.
 
     This is especially important to avoid pytest timeouts.
     """
-    pdfs = [("https://arxiv.org/pdf/2201.00214.pdf", "2201.00214.pdf")]
-    for url, name in pdfs:
-        get_data_from_url(url, name=name)
+    pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")
+    # Fetch every listed file on a thread pool; wait() blocks until all are done but does not re-raise worker errors.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+        futures = [
+            executor.submit(get_data_from_url, pdf["url"], name=pdf["local_filename"])
+            for pdf in pdfs
+        ]
+        concurrent.futures.wait(futures)
+
+
+def test_csv_consistency():
+    """Check that example_files.yaml lists each local filename and each URL only once."""
+    pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")
+    # local_filename becomes the cache path, so a duplicate would make two entries share one file
+    assert len(pdfs) == len({pdf["local_filename"] for pdf in pdfs})
+    # Ensure the urls are unique
+    assert len(pdfs) == len({pdf["url"] for pdf in pdfs})