From e51141d7ed735703bb07f5ffa7e5d2f4d9a79347 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 10 Jul 2022 15:53:04 +0200 Subject: [PATCH] ENH: Add PageObject._get_fonts (#1083) Add possibility to get names of fonts See #153 Co-authored-by: tiarno --- PyPDF2/_page.py | 45 +++++++++++++++++++++++++++++++++++++ tests/test_page.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 94cfb952c..0f0017241 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -39,6 +39,7 @@ Iterator, List, Optional, + Set, Tuple, Union, cast, @@ -1338,6 +1339,18 @@ def extractText( deprecate_with_replacement("extractText", "extract_text") return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep) + def _get_fonts(self) -> Tuple[Set[str], Set[str]]: + """ + Get the names of embedded fonts and unembedded fonts. + + :return: (Set of embedded fonts, set of unembedded fonts) + """ + obj = self.get_object() + assert isinstance(obj, DictionaryObject) + fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj["/Resources"])) + unembedded = fonts - embedded + return embedded, unembedded + mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) """ A :class:`RectangleObject`, expressed in default user space units, @@ -1486,3 +1499,35 @@ def __getitem__(self, index: int) -> PageObject: def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): yield self[i] + + +def _get_fonts_walk( + obj: DictionaryObject, + fnt: Optional[Set[str]] = None, + emb: Optional[Set[str]] = None, +) -> Tuple[Set[str], Set[str]]: + """ + If there is a key called 'BaseFont', that is a font that is used in the document. + If there is a key called 'FontName' and another key in the same dictionary object + that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is + embedded. + + We create and add to two sets, fnt = fonts used and emb = fonts embedded. + """ + if fnt is None: + fnt = set() + if emb is None: + emb = set() + if not hasattr(obj, "keys"): + return set(), set() + fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") + if "/BaseFont" in obj: + fnt.add(cast(str, obj["/BaseFont"])) + if "/FontName" in obj: + if [x for x in fontkeys if x in obj]: # test to see if there is FontFile + emb.add(cast(str, obj["/FontName"])) + + for key in obj.keys(): + _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) + + return fnt, emb # return the sets for each page diff --git a/tests/test_page.py b/tests/test_page.py index 987cf057a..65366459e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -266,3 +266,58 @@ def test_extract_text_operator_t_star(): # L1266, L1267 reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: page.extract_text() + + +@pytest.mark.parametrize( + ("pdf_path", "password", "embedded", "unembedded"), + [ + ( + os.path.join(RESOURCE_ROOT, "crazyones.pdf"), + None, + { + "/HHXGQB+SFTI1440", + "/TITXYI+SFRM0900", + "/YISQAD+SFTI1200", + }, + set(), + ), + ( + os.path.join(RESOURCE_ROOT, "attachment.pdf"), + None, + { + "/HHXGQB+SFTI1440", + "/TITXYI+SFRM0900", + "/YISQAD+SFTI1200", + }, + set(), + ), + ( + os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), + "openpassword", + {"/BAAAAA+DejaVuSans"}, + set(), + ), + ( + os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"), + None, + set(), + {"/Helvetica"}, + ), + (os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"), None, set(), set()), + ( + os.path.join(RESOURCE_ROOT, "reportlab-inline-image.pdf"), + None, + set(), + {"/Helvetica"}, + ), + ], +) +def test_get_fonts(pdf_path, password, embedded, unembedded): + reader = PdfReader(pdf_path, password=password) + a = set() + b = set() + for page in reader.pages: + a_tmp, b_tmp = page._get_fonts() + a = a.union(a_tmp) + b = b.union(b_tmp) + assert (a, b) == (embedded, unembedded)