Skip to content

Commit

Permalink
ENH: Add PageObject._get_fonts (#1083)
Browse files Browse the repository at this point in the history
Add the ability to get the names of the embedded and unembedded fonts of a page

See #153

Co-authored-by: tiarno <[email protected]>
  • Loading branch information
MartinThoma and tiarno authored Jul 10, 2022
1 parent 1e4c2c9 commit e51141d
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
45 changes: 45 additions & 0 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
Iterator,
List,
Optional,
Set,
Tuple,
Union,
cast,
Expand Down Expand Up @@ -1338,6 +1339,18 @@ def extractText(
deprecate_with_replacement("extractText", "extract_text")
return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Report the font names found under this page's ``/Resources`` entry.

    The resources dictionary is scanned with ``_get_fonts_walk``; every font
    name it reports as used but not as embedded counts as unembedded.

    :return: (set of embedded font names, set of unembedded font names)
    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    resources = cast(DictionaryObject, page_dict["/Resources"])
    used_fonts, embedded_fonts = _get_fonts_walk(resources)
    # Anything used but never seen next to a font file is not embedded.
    return embedded_fonts, used_fonts - embedded_fonts

mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
Expand Down Expand Up @@ -1486,3 +1499,35 @@ def __getitem__(self, index: int) -> PageObject:
def __iter__(self) -> Iterator[PageObject]:
    """Yield ``self[0]``, ``self[1]``, ... up to the length measured at start."""
    position = 0
    total = len(self)  # length is read once, before the first item is yielded
    while position < total:
        yield self[position]
        position += 1


def _get_fonts_walk(
    obj: "DictionaryObject",
    fnt: Optional[Set[str]] = None,
    emb: Optional[Set[str]] = None,
) -> Tuple[Set[str], Set[str]]:
    """
    Collect the names of used and embedded fonts reachable from ``obj``.

    A dictionary with a ``/BaseFont`` key names a font that is used.
    A dictionary with a ``/FontName`` key names an embedded font when the same
    dictionary also has a ``/FontFile``, ``/FontFile2`` or ``/FontFile3`` key.

    Every value that has a ``keys`` attribute is visited, and each such object
    is visited at most once (tracked by identity), so a structure that refers
    back to itself no longer leads to unbounded recursion. Values without a
    ``keys`` attribute (strings, numbers, lists, ...) are not descended into.

    :param obj: The dictionary to start from.
    :param fnt: Set that used font names are added to; created when omitted.
    :param emb: Set that embedded font names are added to; created when omitted.
    :return: (fonts used, fonts embedded). These are the very sets passed as
        ``fnt`` / ``emb`` when they were given.
    """
    if fnt is None:
        fnt = set()
    if emb is None:
        emb = set()
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    seen_ids: Set[int] = set()
    # Hold a reference to every visited node so its id() cannot be recycled
    # for a different object while the walk is still running.
    visited: List[object] = []
    pending: List[object] = [obj]
    while pending:
        node = pending.pop()
        if not hasattr(node, "keys") or id(node) in seen_ids:
            continue
        seen_ids.add(id(node))
        visited.append(node)

        current = cast("DictionaryObject", node)
        if "/BaseFont" in current:
            fnt.add(cast(str, current["/BaseFont"]))
        if "/FontName" in current and any(key in current for key in fontkeys):
            emb.add(cast(str, current["/FontName"]))

        # Look children up with current[key], exactly as before, rather than
        # .values(): the mapping's item lookup may differ from its raw values
        # (NOTE(review): presumably it resolves indirect references - confirm).
        pending.extend(current[key] for key in current.keys())

    return fnt, emb
55 changes: 55 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,58 @@ def test_extract_text_operator_t_star(): # L1266, L1267
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


@pytest.mark.parametrize(
    ("pdf_path", "password", "embedded", "unembedded"),
    [
        (
            os.path.join(RESOURCE_ROOT, "crazyones.pdf"),
            None,
            {"/HHXGQB+SFTI1440", "/TITXYI+SFRM0900", "/YISQAD+SFTI1200"},
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "attachment.pdf"),
            None,
            {"/HHXGQB+SFTI1440", "/TITXYI+SFRM0900", "/YISQAD+SFTI1200"},
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"),
            "openpassword",
            {"/BAAAAA+DejaVuSans"},
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
        (
            os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"),
            None,
            set(),
            set(),
        ),
        (
            os.path.join(RESOURCE_ROOT, "reportlab-inline-image.pdf"),
            None,
            set(),
            {"/Helvetica"},
        ),
    ],
)
def test_get_fonts(pdf_path, password, embedded, unembedded):
    """Union the per-page font sets of a PDF and compare with the expectation."""
    reader = PdfReader(pdf_path, password=password)
    found_embedded = set()
    found_unembedded = set()
    for page in reader.pages:
        page_embedded, page_unembedded = page._get_fonts()
        # Accumulate across pages; the expectation is for the whole document.
        found_embedded.update(page_embedded)
        found_unembedded.update(page_unembedded)
    assert (found_embedded, found_unembedded) == (embedded, unembedded)

0 comments on commit e51141d

Please sign in to comment.