Skip to content

Commit

Permalink
Merge branch 'main' into graphics_state_isolation
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Dec 9, 2023
2 parents 74dc3fd + 4aae547 commit 4cbd1ab
Show file tree
Hide file tree
Showing 22 changed files with 290 additions and 192 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
with:
submodules: 'recursive'
- name: Setup Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install requirements (Python 3)
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/github-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
with:
submodules: 'recursive'
- name: Setup Python (3.11+)
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.12 # latest stable python
allow-prereleases: true
Expand Down Expand Up @@ -80,14 +80,14 @@ jobs:
path: '**/tests/pdf_cache/*'
key: cache-downloaded-files
- name: Setup Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
if: matrix.python-version == '3.6' || matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements/ci.txt'
- name: Setup Python (3.11+)
uses: actions/setup-python@v4
uses: actions/setup-python@v5
if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
with:
python-version: ${{ matrix.python-version }}
Expand Down Expand Up @@ -143,7 +143,7 @@ jobs:
with:
submodules: 'recursive'
- name: Setup Python 3.11
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: 'pip'
Expand Down Expand Up @@ -171,7 +171,7 @@ jobs:

steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{env.PYTHON_LATEST}}

Expand All @@ -194,7 +194,7 @@ jobs:

steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
# Use latest Python, so it understands all syntax.
python-version: ${{env.PYTHON_LATEST}}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.x

Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra
* [ArkieCoder](https://github.com/ArkieCoder)
* [Clauss, Christian](https://github.com/cclauss)
* [DL6ER](https://github.com/DL6ER)
* [Duy, Phan Thanh](https://github.com/zuypt)
* [ediamondscience](https://github.com/ediamondscience)
* [Ermeson, Felipe](https://github.com/FelipeErmeson)
* [Freitag, François](https://github.com/francoisfreitag)
Expand Down
8 changes: 7 additions & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import b_, logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject, IndirectObject, NullObject, StreamObject
from .generic import (
DecodedStreamObject,
DictionaryObject,
IndirectObject,
NullObject,
StreamObject,
)


# code freely inspired from @twiggy ; see #711
Expand Down
9 changes: 7 additions & 2 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2240,7 +2240,10 @@ def clean_forms(
# to prevent infinite looping
return [], [] # pragma: no cover
try:
d = cast(Dict[Any, Any], cast(DictionaryObject, elt["/Resources"])["/XObject"])
d = cast(
Dict[Any, Any],
cast(DictionaryObject, elt["/Resources"])["/XObject"],
)
except KeyError:
d = {}
images = []
Expand Down Expand Up @@ -3188,7 +3191,9 @@ def _add_articles_thread(

def add_filtered_articles(
self,
fltr: Union[Pattern[Any], str], # thread entry from the reader's array of threads
fltr: Union[
Pattern[Any], str
], # thread entry from the reader's array of threads
pages: Dict[int, PageObject],
reader: PdfReader,
) -> None:
Expand Down
8 changes: 8 additions & 0 deletions pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,18 @@ def _get_imagemode(
)
return mode2, True
elif color_space[0] == "/DeviceN":
original_color_space = color_space
color_components = len(color_space[1])
color_space = color_space[2]
if isinstance(color_space, IndirectObject): # pragma: no cover
color_space = color_space.get_object()
if color_space == "/DeviceCMYK" and color_components == 1:
if original_color_space[1][0] != "/Black":
logger_warning(
f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
__name__,
)
return "L", True
mode2, invert_color = _get_imagemode(
color_space, color_components, prev_mode, depth + 1
)
Expand Down
16 changes: 9 additions & 7 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,13 +558,16 @@ class CCITTFaxDecode:

@staticmethod
def _get_parameters(
parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], rows: int
parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
rows: int,
) -> CCITParameters:
# TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
k = 0
columns = 1728
if parameters:
parameters_unwrapped = cast(Union[ArrayObject, DictionaryObject], parameters.get_object())
parameters_unwrapped = cast(
Union[ArrayObject, DictionaryObject], parameters.get_object()
)
if isinstance(parameters_unwrapped, ArrayObject):
for decode_parm in parameters_unwrapped:
if CCITT.COLUMNS in decode_parm:
Expand Down Expand Up @@ -778,8 +781,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
alpha = None
filters = x_object_obj.get(SA.FILTER, [None])
lfilters = filters[-1] if isinstance(filters, list) else filters
if lfilters == FT.FLATE_DECODE:
img, image_format, extension, invert_color = _handle_flate(
if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
img, image_format, extension, _ = _handle_flate(
size,
data,
mode,
Expand Down Expand Up @@ -821,15 +824,14 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
".png",
False,
)

# CMYK image and other colorspaces without decode
# requires reverting scale (cf p243,2§ last sentence)
decode = x_object_obj.get(
IA.DECODE,
([1.0, 0.0] * len(img.getbands()))
if (
(img.mode == "CMYK" or (invert_color and img.mode == "L"))
and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
(img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
or (invert_color and img.mode == "L")
)
else None,
)
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ def readFromStream(


def encode_pdfdocencoding(unicode_string: str) -> bytes:
retval = b""
retval = bytearray()
for c in unicode_string:
try:
retval += b_(chr(_pdfdoc_encoding_rev[c]))
Expand Down
4 changes: 2 additions & 2 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def get_next_obj_pos(
else:
return get_next_obj_pos(p, p1, rem_gens[1:], pdf)

def read_unsized_from_steam(
def read_unsized_from_stream(
stream: StreamType, pdf: PdfReaderProtocol
) -> bytes:
# we are just pointing at beginning of the stream
Expand Down Expand Up @@ -535,7 +535,7 @@ def read_unsized_from_steam(
data["__streamdata__"] = data["__streamdata__"][:-1]
elif pdf is not None and not pdf.strict:
stream.seek(pstart, 0)
data["__streamdata__"] = read_unsized_from_steam(stream, pdf)
data["__streamdata__"] = read_unsized_from_stream(stream, pdf)
pos = stream.tell()
else:
stream.seek(pos, 0)
Expand Down
2 changes: 2 additions & 0 deletions requirements/ci-3.11.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.3.1
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
ruff==0.0.290
# via -r requirements/ci.in
typeguard==4.1.2
Expand Down
1 change: 1 addition & 0 deletions requirements/ci.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ pytest-cov
typeguard
types-dataclasses
types-Pillow
pyyaml
2 changes: 2 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.0.2
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
six==1.16.0
# via flake8-print
tomli==1.2.3
Expand Down
74 changes: 51 additions & 23 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import concurrent.futures
import ssl
import urllib.request
from pathlib import Path
from typing import List
from typing import Dict, List, Optional
from urllib.error import HTTPError

import yaml

from pypdf.generic import DictionaryObject, IndirectObject


def get_data_from_url(url: str, name: str) -> bytes:
def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes:
"""
Download a File from a URL and return its contents.
Expand All @@ -22,28 +25,33 @@ def get_data_from_url(url: str, name: str) -> bytes:
Returns:
Read File as bytes
"""
if url.startswith("file://"):
with open(url[7:].replace("\\", "/"), "rb") as fp:
return fp.read()
if name is None:
raise ValueError("A name must always be specified")

cache_dir = Path(__file__).parent / "pdf_cache"
if not cache_dir.exists():
cache_dir.mkdir()
cache_path = cache_dir / name
if not cache_path.exists():
ssl._create_default_https_context = ssl._create_unverified_context
cpt = 3
while cpt > 0:
try:
with urllib.request.urlopen( # noqa: S310
url
) as response, cache_path.open("wb") as out_file:
out_file.write(response.read())
cpt = 0
except HTTPError as e:
if cpt > 0:
cpt -= 1
else:
raise e

if url is not None:
if url.startswith("file://"):
with open(url[7:].replace("\\", "/"), "rb") as fp:
return fp.read()
if not cache_path.exists():
ssl._create_default_https_context = ssl._create_unverified_context
cpt = 3
while cpt > 0:
try:
with urllib.request.urlopen( # noqa: S310
url
) as response, cache_path.open("wb") as out_file:
out_file.write(response.read())
cpt = 0
except HTTPError as e:
if cpt > 0:
cpt -= 1
else:
raise e
with open(cache_path, "rb") as fp:
data = fp.read()
return data
Expand Down Expand Up @@ -106,12 +114,32 @@ def is_sublist(child_list, parent_list):
return is_sublist(child_list, parent_list[1:])


def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]:
    """
    Load a YAML file and return its parsed content.

    Args:
        yaml_file: Path of the YAML document to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the document. Callers in
        this module treat it as a list of string-to-string mappings.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(yaml_file, encoding="utf-8") as yaml_input:
        return yaml.safe_load(yaml_input)


def download_test_pdfs():
"""
Run this before the tests are executed to ensure you have everything locally.
This is especially important to avoid pytest timeouts.
"""
pdfs = [("https://arxiv.org/pdf/2201.00214.pdf", "2201.00214.pdf")]
for url, name in pdfs:
get_data_from_url(url, name=name)
pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = [
executor.submit(get_data_from_url, pdf["url"], name=pdf["local_filename"])
for pdf in pdfs
]
concurrent.futures.wait(futures)


def test_csv_consistency():
    """
    Check that the list of example files contains no duplicate entries.

    Every entry must have a unique local file name and a unique URL.
    """
    # Read the same YAML file, with the same keys, that download_test_pdfs()
    # uses; the loader is a YAML parser, so a ".csv" path cannot be right.
    pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")

    # Ensure the local file names are unique: downloads are cached under this
    # name, so two entries sharing one would collide in the cache directory.
    assert len(pdfs) == len({pdf["local_filename"] for pdf in pdfs})

    # Ensure the urls are unique
    assert len(pdfs) == len({pdf["url"] for pdf in pdfs})
Loading

0 comments on commit 4cbd1ab

Please sign in to comment.