# Patch summary (leading text before the first "diff --git" header is
# skipped by patch tools):
#   * .flake8: add B011 to the tests/* per-file-ignores.
#   * PyPDF2/_writer.py: remove the hash_value / _idnum_hash shortcut from
#     _sweep_indirect_references.
#   * resources/: add the binary fixture
#     Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf (content not
#     included in this text diff).
#   * tests/test_workflows.py: delete tmp.merged.pdf at the end of
#     test_merge_with_warning and test_merge; add test_merge_output, which
#     compares the merged bytes with the new fixture.
#
# NOTE(review): line breaks and indentation were reconstructed from a
# flattened copy of this patch. Hunk line counts match their headers, but
# the absolute indentation of context/removed lines (especially in
# PyPDF2/_writer.py) is inferred - verify against the target tree before
# applying.
# NOTE(review): in test_merge_output, merger.close() runs only after the
# comparison, so it is skipped when the test fails; the `assert False`
# form is presumably what the new B011 ignore is for - pytest.fail(...)
# would avoid needing it. Confirm with the author.
diff --git a/.flake8 b/.flake8
index 95a1289f4..44091fbb1 100644
--- a/.flake8
+++ b/.flake8
@@ -4,4 +4,4 @@
 ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,VNE003,N802,SIM105,P101
 exclude = build,sample-files
 per-file-ignores =
-    tests/*: ASS001,PT011
+    tests/*: ASS001,PT011,B011
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py
index 62d309402..e4599bc74 100644
--- a/PyPDF2/_writer.py
+++ b/PyPDF2/_writer.py
@@ -913,16 +913,8 @@ def _sweep_indirect_references(
                 if newobj is None:
                     try:
                         newobj = data.pdf.get_object(data)
-                        hash_value = None
-                        if newobj is not None:
-                            hash_value = newobj.hash_value()
-                        # Check if object is already added to pdf.
-                        if hash_value in self._idnum_hash:
-                            return IndirectObject(self._idnum_hash[hash_value], 0, self)
                         self._objects.append(None)  # placeholder
                         idnum = len(self._objects)
-                        if hash_value is not None:
-                            self._idnum_hash[hash_value] = idnum
                         newobj_ido = IndirectObject(idnum, 0, self)
                         if data.pdf not in extern_map:
                             extern_map[data.pdf] = {}
diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf
new file mode 100644
index 000000000..aff79f1a4
Binary files /dev/null and b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf differ
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index 8ad201b32..28d52552c 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -244,6 +244,9 @@ def test_merge_with_warning(url, name):
     ):
         merger.write("tmp.merged.pdf")
 
+    # Cleanup
+    os.remove("tmp.merged.pdf")
+
 
 @pytest.mark.parametrize(
     ("url", "name"),
@@ -261,6 +264,9 @@ def test_merge(url, name):
     merger.append(reader)
     merger.write("tmp.merged.pdf")
 
+    # Cleanup
+    os.remove("tmp.merged.pdf")
+
 
 @pytest.mark.parametrize(
     ("url", "name"),
@@ -335,3 +341,34 @@ def test_scale_rectangle_indirect_object():
 
     for page in reader.pages:
         page.scale(sx=2, sy=3)
+
+
+def test_merge_output():
+    # Arrange
+    base = os.path.join(RESOURCE_ROOT, "Seige_of_Vicksburg_Sample_OCR.pdf")
+    crazy = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
+    expected = os.path.join(
+        RESOURCE_ROOT, "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf"
+    )
+
+    # Act
+    merger = PdfMerger(strict=True)
+    with pytest.warns(PdfReadWarning):
+        merger.append(base)
+    merger.merge(1, crazy)
+    stream = BytesIO()
+    merger.write(stream)
+
+    # Assert
+    stream.seek(0)
+    actual = stream.read()
+    with open(expected, "rb") as fp:
+        expected_data = fp.read()
+    if actual != expected_data:
+        # See https://github.com/pytest-dev/pytest/issues/9124
+        assert (
+            False
+        ), f"len(actual) = {len(actual):,} vs len(expected) = {len(expected_data):,}"
+
+    # Cleanup
+    merger.close()