From 97f36bd097b9c677ef283bc39559b73aea76620b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 08:42:06 +0200 Subject: [PATCH] MAINT: Handle XML error when reading XmpInformation (#1030) Closes #585 --- PyPDF2/xmp.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index 5583d772a..5cb79c750 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -20,6 +20,9 @@ from xml.dom.minidom import Document from xml.dom.minidom import Element as XmlElement from xml.dom.minidom import parseString +from xml.parsers.expat import ExpatError + +from PyPDF2.errors import PdfReadError from ._utils import StreamType, deprecate_with_replacement from .generic import ContentStream, PdfObject @@ -205,11 +208,17 @@ class XmpInformation(PdfObject): """ An object that represents Adobe XMP metadata. Usually accessed by :py:attr:`xmp_metadata()` + + :raises: PdfReadError if XML is invalid """ def __init__(self, stream: ContentStream) -> None: self.stream = stream - doc_root: Document = parseString(self.stream.get_data()) + try: + data = self.stream.get_data() + doc_root: Document = parseString(data) + except ExpatError as e: + raise PdfReadError(f"XML in XmpInformation was invalid: {e}") self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS( RDF_NAMESPACE, "RDF" )[0]