Skip to content

Commit

Permalink
ENH: Add PdfReader.xfa attribute (#1026)
Browse files Browse the repository at this point in the history
Closes #408

Co-authored-by: George Alverson <[email protected]>
  • Loading branch information
MartinThoma and georgealverson authored Jul 3, 2022
1 parent 4c43c0e commit 0e18938
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
25 changes: 25 additions & 0 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import re
import struct
import warnings
import zlib
from io import BytesIO
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -1638,6 +1639,30 @@ def isEncrypted(self) -> bool: # pragma: no cover
deprecate_with_replacement("isEncrypted", "is_encrypted")
return self.is_encrypted

@property
def xfa(self) -> Optional[Dict[str, Any]]:
    """
    Return the XFA entries stored in the document's ``/AcroForm``.

    The ``/XFA`` array is read as alternating name/value elements
    (``[name1, value1, name2, value2, ...]``). Each value that is an
    indirect reference resolving to a truthy object is zlib-decompressed
    and stored under the name that precedes it; other values are skipped.

    :return: ``None`` if the catalog has no ``/AcroForm`` entry or the
        entry is falsy; otherwise a dict mapping each name to the
        decompressed bytes of its stream. The dict is empty when the
        form has no ``/XFA`` entry.
    :raises zlib.error: if a stream's raw data cannot be decompressed.
    """
    catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: Dict[str, Any] = {}
    if "/XFA" not in tree:
        return retval

    fields = cast(ArrayObject, tree["/XFA"])
    # zip() over a single shared iterator consumes the array two elements
    # at a time. A trailing name without a matching value is dropped
    # instead of leaking StopIteration out of the property, which is what
    # a bare next() inside the loop would do on an odd-length array.
    entries = iter(fields)
    for tag, value in zip(entries, entries):
        if not isinstance(value, IndirectObject):
            continue
        field = cast(Optional[EncodedStreamObject], value.get_object())
        if field:
            # NOTE(review): this reads the raw ``_data`` and assumes it is
            # zlib-compressed -- confirm for streams using other filters.
            retval[tag] = zlib.decompress(field._data)
    return retval


class PdfFileReader(PdfReader): # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any) -> None:
Expand Down
26 changes: 26 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,32 @@ def test_unexpected_destination():
assert exc.value.args[0] == "Unexpected destination '/1'"


@pytest.mark.parametrize(
    "src",
    [
        os.path.join(RESOURCE_ROOT, filename)
        for filename in ("crazyones.pdf", "commented.pdf")
    ],
)
def test_xfa(src):
    """Check that ``PdfReader.xfa`` is ``None`` for each sample PDF."""
    assert PdfReader(src).xfa is None


def test_xfa_non_empty():
    """Check the names, in order, that ``PdfReader.xfa`` reports for a PDF with XFA data."""
    expected_names = [
        "preamble",
        "config",
        "template",
        "PDFSecurity",
        "datasets",
        "postamble",
    ]
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/942/942050.pdf",
        name="tika-942050.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    assert list(reader.xfa.keys()) == expected_names


@pytest.mark.parametrize(
"src,pdf_header",
[
Expand Down

0 comments on commit 0e18938

Please sign in to comment.