Skip to content

Commit

Permalink
fix: Handle invalid XML also in streaming XML parser. Fixes #1200
Browse files Browse the repository at this point in the history
  • Loading branch information
ecederstrand committed Jul 23, 2023
1 parent 9eccab0 commit 7428a34
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
16 changes: 11 additions & 5 deletions exchangelib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__(self, msg, data):
# Regex of Unicode control characters and noncharacters that are illegal or discouraged in XML 1.0 (and XML 1.1).
# See https://stackoverflow.com/a/22273639/219640
_ILLEGAL_XML_CHARS_RE = re.compile("[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x84\x86-\x9F\uFDD0-\uFDDF\uFFFE\uFFFF]")
_ILLEGAL_XML_ESCAPE_CHARS_RE = re.compile(rb"&(#[0-9]+;?|#[xX][0-9a-fA-F]+;?)") # Could match the above better

# XML namespaces
SOAPNS = "http://schemas.xmlsoap.org/soap/envelope/"
Expand Down Expand Up @@ -268,6 +269,10 @@ def safe_xml_value(value, replacement="?"):
return _ILLEGAL_XML_CHARS_RE.sub(replacement, value)


def sanitize_xml(data, replacement=b"?"):
    """Replace numeric character references that would make the XML unparseable.

    Only references that a strict XML parser rejects are replaced: references without the terminating ';',
    references to code points outside the Unicode range or in the surrogate range, and references to characters
    matched by _ILLEGAL_XML_CHARS_RE. Well-formed references to legal characters (e.g. b"&#233;" or b"&#x41;") are
    left untouched so that valid data is not corrupted.

    Each call only looks at the bytes it is given. A reference that is split across two separately sanitized
    chunks is therefore not detected.

    :param data: The raw XML document, or a chunk of it, as bytes.
    :param replacement: The bytes to insert instead of each offending reference. Accepts the same values as the
        'repl' argument of re.sub(), i.e. a bytes template or a callable taking the match object.
    :return: A copy of 'data' with the offending references replaced.
    """

    def _substitute(match):
        reference = match.group(1)  # E.g. b"#19;", b"#x13;" or, without terminator, b"#x13"
        if reference.endswith(b";"):
            digits = reference[1:-1]
            try:
                code_point = int(digits[1:], 16) if digits[:1] in (b"x", b"X") else int(digits)
            except ValueError:
                # Absurdly long decimal numbers hit the int conversion limit. They can never be a valid code point.
                code_point = -1
            is_surrogate = 0xD800 <= code_point <= 0xDFFF
            if 0 <= code_point <= 0x10FFFF and not is_surrogate and not _ILLEGAL_XML_CHARS_RE.match(chr(code_point)):
                # A well-formed reference to a legal character. Leave it for the XML parser to resolve.
                return match.group(0)
        # Keep the re.sub() semantics of 'replacement': either a callable or a template.
        return replacement(match) if callable(replacement) else match.expand(replacement)

    return _ILLEGAL_XML_ESCAPE_CHARS_RE.sub(_substitute, data)


def create_element(name, attrs=None, nsmap=None):
if ":" in name:
ns, name = name.split(":")
Expand Down Expand Up @@ -362,19 +367,20 @@ def parse(self, r):
collected_data = []
while buffer:
if not self.element_found:
collected_data += buffer
collected_data.extend(buffer)
yield from self.feed(buffer)
buffer = file.read(self._bufsize)
# Any remaining data in self.buffer should be padding chars now
self.buffer = None
self.close()
if not self.element_found:
data = bytes(collected_data)
raise ElementNotFound("The element to be streamed from was not found", data=bytes(data))
raise ElementNotFound("The element to be streamed from was not found", data=bytes(collected_data))

def feed(self, data, isFinal=0):
"""Yield the current content of the character buffer."""
DefusedExpatParser.feed(self, data=data, isFinal=isFinal)
"""Yield the current content of the character buffer. The input XML may contain illegal characters. The lxml
parser handles this gracefully with the 'recover' option, but ExpatParser doesn't have this option. Remove
illegal characters before parsing."""
DefusedExpatParser.feed(self, data=sanitize_xml(data), isFinal=isFinal)
return self._decode_buffer()

def _decode_buffer(self):
Expand Down
9 changes: 8 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
DocumentYielder,
ParseError,
PrettyXmlHandler,
StreamingBase64Parser,
chunkify,
get_domain,
get_redirect_url,
Expand Down Expand Up @@ -129,8 +130,9 @@ def test_get_redirect_url(self, m):

def test_to_xml(self):
to_xml(b'<?xml version="1.0" encoding="UTF-8"?><foo></foo>')
to_xml(b'<?xml version="1.0" encoding="UTF-8"?><foo>&broken</foo>')
to_xml(b'<?xml version="1.0" encoding="UTF-8"?><foo>&#x13;</foo>')
to_xml(BOM_UTF8 + b'<?xml version="1.0" encoding="UTF-8"?><foo></foo>')
to_xml(BOM_UTF8 + b'<?xml version="1.0" encoding="UTF-8"?><foo>&broken</foo>')
with self.assertRaises(ParseError):
to_xml(b"foo")

Expand Down Expand Up @@ -166,6 +168,11 @@ def test_xml_to_str(self):
with self.assertRaises(AttributeError):
xml_to_str("XXX", encoding=None, xml_declaration=True)

def test_streaming_parser(self):
    """Feed complete documents to fresh streaming parsers and expect no exception to be raised."""
    payloads = (
        # A plain, well-formed document
        b"<Name>SomeName.png</Name>",
        # Test that we can handle invalid chars in the streaming parser
        b"<Name>SomeName&#x13;.png</Name>",
    )
    for payload in payloads:
        parser = StreamingBase64Parser()
        parser.feed(payload, 1)

def test_anonymizing_handler(self):
h = AnonymizingXmlHandler(forbidden_strings=("XXX", "yyy"))
self.assertEqual(
Expand Down

0 comments on commit 7428a34

Please sign in to comment.