Don’t break reproducible PDF creation when using images

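Fix #1666. Note that the added test doesn’t actually exercise this problem, because the built-in "hash" function returns stable values within a single process (its results for strings only vary between separate interpreter runs), and the test performs both renderings in the same process. If someone finds a nice way to solve this, i.e. to test reproducibility across processes…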
Kozea · Jun 29, 2022 · 00d5b03 · 00d5b03
1 parent 5486875
commit 00d5b03
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 3 deletions.
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -358,6 +358,7 @@ def test_command_line_render(tmpdir):
     assert tmpdir.join('out13.pdf').read_binary() == rotated_pdf_bytes
     assert tmpdir.join('out14.pdf').read_binary() == rotated_pdf_bytes
 
+    os.environ['SOURCE_DATE_EPOCH'] = '0'
     _run('not_optimized.html out15.pdf')
     _run('not_optimized.html out16.pdf -O images')
     _run('not_optimized.html out17.pdf -O fonts')
@@ -366,12 +367,20 @@ def test_command_line_render(tmpdir):
     _run('not_optimized.html out20.pdf -O none')
     _run('not_optimized.html out21.pdf -O none -O all')
     _run('not_optimized.html out22.pdf -O all -O none')
-    # TODO: test that equivalent CLI options give equivalent PDF sizes,
-    # unfortunately font optimization makes PDF generation not reproducible
     assert (
         len(tmpdir.join('out16.pdf').read_binary()) <
         len(tmpdir.join('out15.pdf').read_binary()) <
         len(tmpdir.join('out20.pdf').read_binary()))
+    assert len({
+        tmpdir.join(f'out{i}.pdf').read_binary()
+        for i in (16, 18, 19, 21)}) == 1
+    assert len({
+        tmpdir.join(f'out{i}.pdf').read_binary()
+        for i in (15, 17)}) == 1
+    assert len({
+        tmpdir.join(f'out{i}.pdf').read_binary()
+        for i in (20, 22)}) == 1
+    os.environ.pop('SOURCE_DATE_EPOCH')
 
     stdout = _run('combined.html -')
     assert stdout.count(b'attachment') == 0
@@ -450,6 +459,14 @@ def test_partial_pdf_custom_metadata():
     assert b'value' in stdout
 
 
+def test_reproducible():
+    """Check that rendering the same document twice gives identical bytes.
+
+    ``SOURCE_DATE_EPOCH`` is set to ``'0'`` for both runs, then both outputs
+    are compared byte for byte.
+    """
+    # Remember any pre-existing value so that the caller's environment is
+    # restored exactly, instead of being unconditionally removed.
+    previous = os.environ.get('SOURCE_DATE_EPOCH')
+    os.environ['SOURCE_DATE_EPOCH'] = '0'
+    try:
+        stdout1 = _run('- -', b'<body>a<img src=pattern.png>')
+        stdout2 = _run('- -', b'<body>a<img src=pattern.png>')
+    finally:
+        # Always undo the environment change, even when a run fails, so the
+        # variable can't leak into tests executed afterwards.
+        if previous is None:
+            os.environ.pop('SOURCE_DATE_EPOCH', None)
+        else:
+            os.environ['SOURCE_DATE_EPOCH'] = previous
+    assert stdout1 == stdout2
+
+
 @assert_no_logs
 def test_unicode_filenames(assert_pixels_equal, tmpdir):
     """Test non-ASCII filenames both in Unicode or bytes form."""

diff --git a/weasyprint/images.py b/weasyprint/images.py
@@ -1,6 +1,7 @@
 """Fetch and decode images in various formats."""
 
 import math
+from hashlib import md5
 from io import BytesIO
 from itertools import cycle
 from math import inf
@@ -129,7 +130,7 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
                             raster_exception)
                 else:
                     # Store image id to enable cache in Stream.add_image
-                    image_id = hash(url)
+                    image_id = md5(url.encode()).hexdigest()
                     image = RasterImage(pillow_image, image_id, optimize_size)
 
     except (URLFetchingError, ImageLoadingError) as exception: