From f2ffa7af8f52eed3c37d9d6bc376db9f481b2a03 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 29 Jun 2022 13:37:35 +0200 Subject: [PATCH] DOC: Compression of content streams (#1040) --- docs/user/file-size.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index fa56a81ed..dc9d0cd71 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -27,17 +27,17 @@ with open("smaller-new-file.pdf", "wb") as fp: ``` It depends on the PDF how well this works, but we have seen an 86% file -reduction from 5.7 MB to 0.8 MB within a real PDF. +reduction (from 5.7 MB to 0.8 MB) within a real PDF. ## Remove images ```python -import PyPDF2 +from PyPDF2 import PdfReader, PdfWriter -reader = PyPDF2.PdfReader("example.pdf") -writer = PyPDF2.PdfWriter() +reader = PdfReader("example.pdf") +writer = PdfWriter() for page in reader.pages: writer.add_page(page) @@ -48,18 +48,27 @@ with open("out.pdf", "wb") as f: writer.write(f) ``` -## Compression +## Lossless Compression + +PyPDF2 supports the FlateDecode filter which uses the zlib/deflate compression +method. It is a lossless compression, meaning the resulting PDF looks exactly +the same. + +Deflate compression can be applied to a page via [`page.compress_content_streams`](https://pypdf2.readthedocs.io/en/latest/modules/PageObject.html#PyPDF2._page.PageObject.compress_content_streams): ```python -import PyPDF2 +from PyPDF2 import PdfReader, PdfWriter -reader = PyPDF2.PdfReader("example.pdf") -writer = PyPDF2.PdfWriter() +reader = PdfReader("example.pdf") +writer = PdfWriter() for page in reader.pages: - page.compress_content_streams() + page.compress_content_streams() # This is CPU intensive! writer.add_page(page) with open("out.pdf", "wb") as f: writer.write(f) ``` + +Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB) +with a real PDF.