Skip to content

Commit

Permalink
Don’t break reproducible PDF creation when using images
Browse files Browse the repository at this point in the history
Fix #1666.

Note that the added test doesn’t actually exercise this problem, because the
built-in "hash" function is stable within a single process (its values only
change between interpreter runs). If someone finds a nice way to test this…
  • Loading branch information
liZe committed Jun 29, 2022
1 parent 5486875 commit 00d5b03
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
21 changes: 19 additions & 2 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def test_command_line_render(tmpdir):
assert tmpdir.join('out13.pdf').read_binary() == rotated_pdf_bytes
assert tmpdir.join('out14.pdf').read_binary() == rotated_pdf_bytes

os.environ['SOURCE_DATE_EPOCH'] = '0'
_run('not_optimized.html out15.pdf')
_run('not_optimized.html out16.pdf -O images')
_run('not_optimized.html out17.pdf -O fonts')
Expand All @@ -366,12 +367,20 @@ def test_command_line_render(tmpdir):
_run('not_optimized.html out20.pdf -O none')
_run('not_optimized.html out21.pdf -O none -O all')
_run('not_optimized.html out22.pdf -O all -O none')
# TODO: test that equivalent CLI options give equivalent PDF sizes,
# unfortunately font optimization makes PDF generation not reproducible
assert (
len(tmpdir.join('out16.pdf').read_binary()) <
len(tmpdir.join('out15.pdf').read_binary()) <
len(tmpdir.join('out20.pdf').read_binary()))
assert len({
tmpdir.join(f'out{i}.pdf').read_binary()
for i in (16, 18, 19, 21)}) == 1
assert len({
tmpdir.join(f'out{i}.pdf').read_binary()
for i in (15, 17)}) == 1
assert len({
tmpdir.join(f'out{i}.pdf').read_binary()
for i in (20, 22)}) == 1
os.environ.pop('SOURCE_DATE_EPOCH')

stdout = _run('combined.html -')
assert stdout.count(b'attachment') == 0
Expand Down Expand Up @@ -450,6 +459,14 @@ def test_partial_pdf_custom_metadata():
assert b'value' in stdout


def test_reproducible():
    """Check that rendering the same document twice gives identical output.

    The same HTML snippet (text plus an image) is rendered two times through
    ``_run`` with ``SOURCE_DATE_EPOCH`` set to ``'0'``, and the two outputs
    are compared for equality.

    NOTE(review): according to the commit message that introduced this test,
    it does not catch instability caused by the built-in ``hash`` function,
    because ``hash`` is stable within a single process.
    """
    # Remember any pre-existing value so the test leaves the environment
    # exactly as it found it.
    previous_epoch = os.environ.get('SOURCE_DATE_EPOCH')
    os.environ['SOURCE_DATE_EPOCH'] = '0'
    try:
        stdout1 = _run('- -', b'<body>a<img src=pattern.png>')
        stdout2 = _run('- -', b'<body>a<img src=pattern.png>')
    finally:
        # Restore the environment even if a run raises, so a failure here
        # cannot leak SOURCE_DATE_EPOCH into other tests.
        if previous_epoch is None:
            os.environ.pop('SOURCE_DATE_EPOCH', None)
        else:
            os.environ['SOURCE_DATE_EPOCH'] = previous_epoch
    assert stdout1 == stdout2


@assert_no_logs
def test_unicode_filenames(assert_pixels_equal, tmpdir):
"""Test non-ASCII filenames both in Unicode or bytes form."""
Expand Down
3 changes: 2 additions & 1 deletion weasyprint/images.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Fetch and decode images in various formats."""

import math
from hashlib import md5
from io import BytesIO
from itertools import cycle
from math import inf
Expand Down Expand Up @@ -129,7 +130,7 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
raster_exception)
else:
# Store image id to enable cache in Stream.add_image
image_id = hash(url)
image_id = md5(url.encode()).hexdigest()
image = RasterImage(pillow_image, image_id, optimize_size)

except (URLFetchingError, ImageLoadingError) as exception:
Expand Down

0 comments on commit 00d5b03

Please sign in to comment.